• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 *   Copyright (C) 2002-2016 International Business Machines Corporation
6 *   and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 //  file:  rematch.cpp
11 //
12 //         Contains the implementation of class RegexMatcher,
13 //         which is one of the main API classes for the ICU regular expression package.
14 //
15 
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18 
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36 
37 // #include <malloc.h>        // Needed for heapcheck testing
38 
39 
40 U_NAMESPACE_BEGIN
41 
42 // Default limit for the size of the back track stack, to avoid system
43 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 //    which use recursion rather than the heap, and take more storage per
46 //    backtrack point.
47 //
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49 
50 // Time limit counter constant.
51 //   Time limits for expression evaluation are in terms of quanta of work by
52 //   the engine, each of which is 10,000 state saves.
53 //   This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55 
56 
57 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)58 static inline UBool isLineTerminator(UChar32 c) {
59     if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60         return false;
61     }
62     return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64 
65 //-----------------------------------------------------------------------------
66 //
67 //   Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)70 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
71     fDeferredStatus = U_ZERO_ERROR;
72     init(fDeferredStatus);
73     if (U_FAILURE(fDeferredStatus)) {
74         return;
75     }
76     if (pat==nullptr) {
77         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78         return;
79     }
80     fPattern = pat;
81     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83 
84 
85 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)86 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87                            uint32_t flags, UErrorCode &status) {
88     init(status);
89     if (U_FAILURE(status)) {
90         return;
91     }
92     UParseError    pe;
93     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
94     fPattern           = fPatternOwned;
95 
96     UText inputText = UTEXT_INITIALIZER;
97     utext_openConstUnicodeString(&inputText, &input, &status);
98     init2(&inputText, status);
99     utext_close(&inputText);
100 
101     fInputUniStrMaybeMutable = true;
102 }
103 
104 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106                            uint32_t flags, UErrorCode &status) {
107     init(status);
108     if (U_FAILURE(status)) {
109         return;
110     }
111     UParseError    pe;
112     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
113     if (U_FAILURE(status)) {
114         return;
115     }
116 
117     fPattern           = fPatternOwned;
118     init2(input, status);
119 }
120 
121 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)122 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123                            uint32_t flags, UErrorCode &status) {
124     init(status);
125     if (U_FAILURE(status)) {
126         return;
127     }
128     UParseError    pe;
129     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
130     if (U_FAILURE(status)) {
131         return;
132     }
133     fPattern           = fPatternOwned;
134     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)137 RegexMatcher::RegexMatcher(UText *regexp,
138                            uint32_t flags, UErrorCode &status) {
139     init(status);
140     if (U_FAILURE(status)) {
141         return;
142     }
143     UParseError    pe;
144     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
145         if (U_FAILURE(status)) {
146         return;
147     }
148 
149     fPattern           = fPatternOwned;
150     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152 
153 
154 
155 
~RegexMatcher()156 RegexMatcher::~RegexMatcher() {
157     delete fStack;
158     if (fData != fSmallData) {
159         uprv_free(fData);
160         fData = nullptr;
161     }
162     if (fPatternOwned) {
163         delete fPatternOwned;
164         fPatternOwned = nullptr;
165         fPattern = nullptr;
166     }
167 
168     delete fInput;
169     if (fInputText) {
170         utext_close(fInputText);
171     }
172     if (fAltInputText) {
173         utext_close(fAltInputText);
174     }
175 
176     #if UCONFIG_NO_BREAK_ITERATION==0
177     delete fWordBreakItr;
178     delete fGCBreakItr;
179     #endif
180 }
181 
182 //
183 //   init()   common initialization for use by all constructors.
184 //            Initialize all fields, get the object into a consistent state.
185 //            This must be done even when the initial status shows an error,
186 //            so that the object is initialized sufficiently well for the destructor
187 //            to run safely.
188 //
init(UErrorCode & status)189 void RegexMatcher::init(UErrorCode &status) {
190     fPattern           = nullptr;
191     fPatternOwned      = nullptr;
192     fFrameSize         = 0;
193     fRegionStart       = 0;
194     fRegionLimit       = 0;
195     fAnchorStart       = 0;
196     fAnchorLimit       = 0;
197     fLookStart         = 0;
198     fLookLimit         = 0;
199     fActiveStart       = 0;
200     fActiveLimit       = 0;
201     fTransparentBounds = false;
202     fAnchoringBounds   = true;
203     fMatch             = false;
204     fMatchStart        = 0;
205     fMatchEnd          = 0;
206     fLastMatchEnd      = -1;
207     fAppendPosition    = 0;
208     fHitEnd            = false;
209     fRequireEnd        = false;
210     fStack             = nullptr;
211     fFrame             = nullptr;
212     fTimeLimit         = 0;
213     fTime              = 0;
214     fTickCounter       = 0;
215     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
216     fCallbackFn        = nullptr;
217     fCallbackContext   = nullptr;
218     fFindProgressCallbackFn      = nullptr;
219     fFindProgressCallbackContext = nullptr;
220     fTraceDebug        = false;
221     fDeferredStatus    = status;
222     fData              = fSmallData;
223     fWordBreakItr      = nullptr;
224     fGCBreakItr        = nullptr;
225 
226     fStack             = nullptr;
227     fInputText         = nullptr;
228     fAltInputText      = nullptr;
229     fInput             = nullptr;
230     fInputLength       = 0;
231     fInputUniStrMaybeMutable = false;
232 }
233 
234 //
235 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
236 //            This handles the common setup to be done after the Pattern is available.
237 //
init2(UText * input,UErrorCode & status)238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
239     if (U_FAILURE(status)) {
240         fDeferredStatus = status;
241         return;
242     }
243 
244     if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246         if (fData == nullptr) {
247             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248             return;
249         }
250     }
251 
252     fStack = new UVector64(status);
253     if (fStack == nullptr) {
254         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255         return;
256     }
257 
258     reset(input);
259     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260     if (U_FAILURE(status)) {
261         fDeferredStatus = status;
262         return;
263     }
264 }
265 
266 
267 static const char16_t BACKSLASH  = 0x5c;
268 static const char16_t DOLLARSIGN = 0x24;
269 static const char16_t LEFTBRACKET = 0x7b;
270 static const char16_t RIGHTBRACKET = 0x7d;
271 
272 //--------------------------------------------------------------------------------
273 //
274 //    appendReplacement
275 //
276 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278                                               const UnicodeString &replacement,
279                                               UErrorCode &status) {
280     UText replacementText = UTEXT_INITIALIZER;
281 
282     utext_openConstUnicodeString(&replacementText, &replacement, &status);
283     if (U_SUCCESS(status)) {
284         UText resultText = UTEXT_INITIALIZER;
285         utext_openUnicodeString(&resultText, &dest, &status);
286 
287         if (U_SUCCESS(status)) {
288             appendReplacement(&resultText, &replacementText, status);
289             utext_close(&resultText);
290         }
291         utext_close(&replacementText);
292     }
293 
294     return *this;
295 }
296 
297 //
298 //    appendReplacement, UText mode
299 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301                                               UText *replacement,
302                                               UErrorCode &status) {
303     if (U_FAILURE(status)) {
304         return *this;
305     }
306     if (U_FAILURE(fDeferredStatus)) {
307         status = fDeferredStatus;
308         return *this;
309     }
310     if (fMatch == false) {
311         status = U_REGEX_INVALID_STATE;
312         return *this;
313     }
314 
315     // Copy input string from the end of previous match to start of current match
316     int64_t  destLen = utext_nativeLength(dest);
317     if (fMatchStart > fAppendPosition) {
318         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320                                      (int32_t)(fMatchStart-fAppendPosition), &status);
321         } else {
322             int32_t len16;
323             if (UTEXT_USES_U16(fInputText)) {
324                 len16 = (int32_t)(fMatchStart-fAppendPosition);
325             } else {
326                 UErrorCode lengthStatus = U_ZERO_ERROR;
327                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, nullptr, 0, &lengthStatus);
328             }
329             char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(len16+1));
330             if (inputChars == nullptr) {
331                 status = U_MEMORY_ALLOCATION_ERROR;
332                 return *this;
333             }
334             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336             uprv_free(inputChars);
337         }
338     }
339     fAppendPosition = fMatchEnd;
340 
341 
342     // scan the replacement text, looking for substitutions ($n) and \escapes.
343     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
344     //         move entire ranges not containing substitutions.
345     UTEXT_SETNATIVEINDEX(replacement, 0);
346     for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
347         if (c == BACKSLASH) {
348             // Backslash Escape.  Copy the following char out without further checks.
349             //                    Note:  Surrogate pairs don't need any special handling
350             //                           The second half wont be a '$' or a '\', and
351             //                           will move to the dest normally on the next
352             //                           loop iteration.
353             c = UTEXT_CURRENT32(replacement);
354             if (c == U_SENTINEL) {
355                 break;
356             }
357 
358             if (c==0x55/*U*/ || c==0x75/*u*/) {
359                 // We have a \udddd or \Udddddddd escape sequence.
360                 int32_t offset = 0;
361                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363                 if (escapedChar != (UChar32)0xFFFFFFFF) {
364                     if (U_IS_BMP(escapedChar)) {
365                         char16_t c16 = (char16_t)escapedChar;
366                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367                     } else {
368                         char16_t surrogate[2];
369                         surrogate[0] = U16_LEAD(escapedChar);
370                         surrogate[1] = U16_TRAIL(escapedChar);
371                         if (U_SUCCESS(status)) {
372                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373                         }
374                     }
375                     // TODO:  Report errors for mal-formed \u escapes?
376                     //        As this is, the original sequence is output, which may be OK.
377                     if (context.lastOffset == offset) {
378                         (void)UTEXT_PREVIOUS32(replacement);
379                     } else if (context.lastOffset != offset-1) {
380                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381                     }
382                 }
383             } else {
384                 (void)UTEXT_NEXT32(replacement);
385                 // Plain backslash escape.  Just put out the escaped character.
386                 if (U_IS_BMP(c)) {
387                     char16_t c16 = (char16_t)c;
388                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389                 } else {
390                     char16_t surrogate[2];
391                     surrogate[0] = U16_LEAD(c);
392                     surrogate[1] = U16_TRAIL(c);
393                     if (U_SUCCESS(status)) {
394                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395                     }
396                 }
397             }
398         } else if (c != DOLLARSIGN) {
399             // Normal char, not a $.  Copy it out without further checks.
400             if (U_IS_BMP(c)) {
401                 char16_t c16 = (char16_t)c;
402                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403             } else {
404                 char16_t surrogate[2];
405                 surrogate[0] = U16_LEAD(c);
406                 surrogate[1] = U16_TRAIL(c);
407                 if (U_SUCCESS(status)) {
408                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409                 }
410             }
411         } else {
412             // We've got a $.  Pick up a capture group name or number if one follows.
413             // Consume digits so long as the resulting group number <= the number of
414             // number of capture groups in the pattern.
415 
416             int32_t groupNum  = 0;
417             int32_t numDigits = 0;
418             UChar32 nextChar = utext_current32(replacement);
419             if (nextChar == LEFTBRACKET) {
420                 // Scan for a Named Capture Group, ${name}.
421                 UnicodeString groupName;
422                 utext_next32(replacement);
423                 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424                     nextChar = utext_next32(replacement);
425                     if (nextChar == U_SENTINEL) {
426                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427                     } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
428                                (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
429                                (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
430                         groupName.append(nextChar);
431                     } else if (nextChar == RIGHTBRACKET) {
432                         groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
433                         if (groupNum == 0) {
434                             status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435                         }
436                     } else {
437                         // Character was something other than a name char or a closing '}'
438                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439                     }
440                 }
441 
442             } else if (u_isdigit(nextChar)) {
443                 // $n    Scan for a capture group number
444                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445                 for (;;) {
446                     nextChar = UTEXT_CURRENT32(replacement);
447                     if (nextChar == U_SENTINEL) {
448                         break;
449                     }
450                     if (u_isdigit(nextChar) == false) {
451                         break;
452                     }
453                     int32_t nextDigitVal = u_charDigitValue(nextChar);
454                     if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455                         // Don't consume the next digit if it makes the capture group number too big.
456                         if (numDigits == 0) {
457                             status = U_INDEX_OUTOFBOUNDS_ERROR;
458                         }
459                         break;
460                     }
461                     (void)UTEXT_NEXT32(replacement);
462                     groupNum=groupNum*10 + nextDigitVal;
463                     ++numDigits;
464                 }
465             } else {
466                 // $ not followed by capture group name or number.
467                 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468             }
469 
470             if (U_SUCCESS(status)) {
471                 destLen += appendGroup(groupNum, dest, status);
472             }
473         }  // End of $ capture group handling
474     }  // End of per-character loop through the replacement string.
475 
476     return *this;
477 }
478 
479 
480 
481 //--------------------------------------------------------------------------------
482 //
483 //    appendTail     Intended to be used in conjunction with appendReplacement()
484 //                   To the destination string, append everything following
485 //                   the last match position from the input string.
486 //
487 //                   Note:  Match ranges do not affect appendTail or appendReplacement
488 //
489 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491     UErrorCode status = U_ZERO_ERROR;
492     UText resultText = UTEXT_INITIALIZER;
493     utext_openUnicodeString(&resultText, &dest, &status);
494 
495     if (U_SUCCESS(status)) {
496         appendTail(&resultText, status);
497         utext_close(&resultText);
498     }
499 
500     return dest;
501 }
502 
503 //
504 //   appendTail, UText mode
505 //
appendTail(UText * dest,UErrorCode & status)506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507     if (U_FAILURE(status)) {
508         return dest;
509     }
510     if (U_FAILURE(fDeferredStatus)) {
511         status = fDeferredStatus;
512         return dest;
513     }
514 
515     if (fInputLength > fAppendPosition) {
516         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517             int64_t destLen = utext_nativeLength(dest);
518             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519                           (int32_t)(fInputLength-fAppendPosition), &status);
520         } else {
521             int32_t len16;
522             if (UTEXT_USES_U16(fInputText)) {
523                 len16 = (int32_t)(fInputLength-fAppendPosition);
524             } else {
525                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, nullptr, 0, &status);
526                 status = U_ZERO_ERROR; // buffer overflow
527             }
528 
529             char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(len16));
530             if (inputChars == nullptr) {
531                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532             } else {
533                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534                 int64_t destLen = utext_nativeLength(dest);
535                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536                 uprv_free(inputChars);
537             }
538         }
539     }
540     return dest;
541 }
542 
543 
544 
545 //--------------------------------------------------------------------------------
546 //
547 //   end
548 //
549 //--------------------------------------------------------------------------------
end(UErrorCode & err) const550 int32_t RegexMatcher::end(UErrorCode &err) const {
551     return end(0, err);
552 }
553 
end64(UErrorCode & err) const554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555     return end64(0, err);
556 }
557 
end64(int32_t group,UErrorCode & err) const558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559     if (U_FAILURE(err)) {
560         return -1;
561     }
562     if (fMatch == false) {
563         err = U_REGEX_INVALID_STATE;
564         return -1;
565     }
566     if (group < 0 || group > fPattern->fGroupMap->size()) {
567         err = U_INDEX_OUTOFBOUNDS_ERROR;
568         return -1;
569     }
570     int64_t e = -1;
571     if (group == 0) {
572         e = fMatchEnd;
573     } else {
574         // Get the position within the stack frame of the variables for
575         //    this capture group.
576         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577         U_ASSERT(groupOffset < fPattern->fFrameSize);
578         U_ASSERT(groupOffset >= 0);
579         e = fFrame->fExtra[groupOffset + 1];
580     }
581 
582         return e;
583 }
584 
end(int32_t group,UErrorCode & err) const585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586     return (int32_t)end64(group, err);
587 }
588 
589 //--------------------------------------------------------------------------------
590 //
591 //   findProgressInterrupt  This function is called once for each advance in the target
592 //                          string from the find() function, and calls the user progress callback
593 //                          function if there is one installed.
594 //
595 //         Return:  true if the find operation is to be terminated.
596 //                  false if the find operation is to continue running.
597 //
598 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600     if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601         status = U_REGEX_STOPPED_BY_CALLER;
602         return true;
603     }
604     return false;
605 }
606 
607 //--------------------------------------------------------------------------------
608 //
609 //   find()
610 //
611 //--------------------------------------------------------------------------------
find()612 UBool RegexMatcher::find() {
613     if (U_FAILURE(fDeferredStatus)) {
614         return false;
615     }
616     UErrorCode status = U_ZERO_ERROR;
617     UBool result = find(status);
618     return result;
619 }
620 
621 //--------------------------------------------------------------------------------
622 //
623 //   find()
624 //
625 //--------------------------------------------------------------------------------
find(UErrorCode & status)626 UBool RegexMatcher::find(UErrorCode &status) {
627     // Start at the position of the last match end.  (Will be zero if the
628     //   matcher has been reset.)
629     //
630     if (U_FAILURE(status)) {
631         return false;
632     }
633     if (U_FAILURE(fDeferredStatus)) {
634         status = fDeferredStatus;
635         return false;
636     }
637 
638     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639         return findUsingChunk(status);
640     }
641 
642     int64_t startPos = fMatchEnd;
643     if (startPos==0) {
644         startPos = fActiveStart;
645     }
646 
647     if (fMatch) {
648         // Save the position of any previous successful match.
649         fLastMatchEnd = fMatchEnd;
650 
651         if (fMatchStart == fMatchEnd) {
652             // Previous match had zero length.  Move start position up one position
653             //  to avoid sending find() into a loop on zero-length matches.
654             if (startPos >= fActiveLimit) {
655                 fMatch = false;
656                 fHitEnd = true;
657                 return false;
658             }
659             UTEXT_SETNATIVEINDEX(fInputText, startPos);
660             (void)UTEXT_NEXT32(fInputText);
661             startPos = UTEXT_GETNATIVEINDEX(fInputText);
662         }
663     } else {
664         if (fLastMatchEnd >= 0) {
665             // A previous find() failed to match.  Don't try again.
666             //   (without this test, a pattern with a zero-length match
667             //    could match again at the end of an input string.)
668             fHitEnd = true;
669             return false;
670         }
671     }
672 
673 
674     // Compute the position in the input string beyond which a match can not begin, because
675     //   the minimum length match would extend past the end of the input.
676     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
677     //          Be aware of possible overflows if making changes here.
678     int64_t testStartLimit;
679     if (UTEXT_USES_U16(fInputText)) {
680         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681         if (startPos > testStartLimit) {
682             fMatch = false;
683             fHitEnd = true;
684             return false;
685         }
686     } else {
687         // We don't know exactly how long the minimum match length is in native characters.
688         // Treat anything > 0 as 1.
689         testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690     }
691 
692     UChar32  c;
693     U_ASSERT(startPos >= 0);
694 
695     switch (fPattern->fStartType) {
696     case START_NO_INFO:
697         // No optimization was found.
698         //  Try a match at each input position.
699         for (;;) {
700             MatchAt(startPos, false, status);
701             if (U_FAILURE(status)) {
702                 return false;
703             }
704             if (fMatch) {
705                 return true;
706             }
707             if (startPos >= testStartLimit) {
708                 fHitEnd = true;
709                 return false;
710             }
711             UTEXT_SETNATIVEINDEX(fInputText, startPos);
712             (void)UTEXT_NEXT32(fInputText);
713             startPos = UTEXT_GETNATIVEINDEX(fInputText);
714             // Note that it's perfectly OK for a pattern to have a zero-length
715             //   match at the end of a string, so we must make sure that the loop
716             //   runs with startPos == testStartLimit the last time through.
717             if  (findProgressInterrupt(startPos, status))
718                 return false;
719         }
720         UPRV_UNREACHABLE_EXIT;
721 
722     case START_START:
723         // Matches are only possible at the start of the input string
724         //   (pattern begins with ^ or \A)
725         if (startPos > fActiveStart) {
726             fMatch = false;
727             return false;
728         }
729         MatchAt(startPos, false, status);
730         if (U_FAILURE(status)) {
731             return false;
732         }
733         return fMatch;
734 
735 
736     case START_SET:
737         {
738             // Match may start on any char from a pre-computed set.
739             U_ASSERT(fPattern->fMinMatchLen > 0);
740             UTEXT_SETNATIVEINDEX(fInputText, startPos);
741             for (;;) {
742                 int64_t pos = startPos;
743                 c = UTEXT_NEXT32(fInputText);
744                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745                 // c will be -1 (U_SENTINEL) at end of text, in which case we
746                 // skip this next block (so we don't have a negative array index)
747                 // and handle end of text in the following block.
748                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
750                     MatchAt(pos, false, status);
751                     if (U_FAILURE(status)) {
752                         return false;
753                     }
754                     if (fMatch) {
755                         return true;
756                     }
757                     UTEXT_SETNATIVEINDEX(fInputText, pos);
758                 }
759                 if (startPos > testStartLimit) {
760                     fMatch = false;
761                     fHitEnd = true;
762                     return false;
763                 }
764                 if  (findProgressInterrupt(startPos, status))
765                     return false;
766             }
767         }
768         UPRV_UNREACHABLE_EXIT;
769 
770     case START_STRING:
771     case START_CHAR:
772         {
773             // Match starts on exactly one char.
774             U_ASSERT(fPattern->fMinMatchLen > 0);
775             UChar32 theChar = fPattern->fInitialChar;
776             UTEXT_SETNATIVEINDEX(fInputText, startPos);
777             for (;;) {
778                 int64_t pos = startPos;
779                 c = UTEXT_NEXT32(fInputText);
780                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
781                 if (c == theChar) {
782                     MatchAt(pos, false, status);
783                     if (U_FAILURE(status)) {
784                         return false;
785                     }
786                     if (fMatch) {
787                         return true;
788                     }
789                     UTEXT_SETNATIVEINDEX(fInputText, startPos);
790                 }
791                 if (startPos > testStartLimit) {
792                     fMatch = false;
793                     fHitEnd = true;
794                     return false;
795                 }
796                 if  (findProgressInterrupt(startPos, status))
797                     return false;
798            }
799         }
800         UPRV_UNREACHABLE_EXIT;
801 
802     case START_LINE:
803         {
804             UChar32 ch;
805             if (startPos == fAnchorStart) {
806                 MatchAt(startPos, false, status);
807                 if (U_FAILURE(status)) {
808                     return false;
809                 }
810                 if (fMatch) {
811                     return true;
812                 }
813                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814                 ch = UTEXT_NEXT32(fInputText);
815                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816             } else {
817                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818                 ch = UTEXT_PREVIOUS32(fInputText);
819                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820             }
821 
822             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823                 for (;;) {
824                     if (ch == 0x0a) {
825                             MatchAt(startPos, false, status);
826                             if (U_FAILURE(status)) {
827                                 return false;
828                             }
829                             if (fMatch) {
830                                 return true;
831                             }
832                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
833                     }
834                     if (startPos >= testStartLimit) {
835                         fMatch = false;
836                         fHitEnd = true;
837                         return false;
838                     }
839                     ch = UTEXT_NEXT32(fInputText);
840                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
841                     // Note that it's perfectly OK for a pattern to have a zero-length
842                     //   match at the end of a string, so we must make sure that the loop
843                     //   runs with startPos == testStartLimit the last time through.
844                     if  (findProgressInterrupt(startPos, status))
845                         return false;
846                 }
847             } else {
848                 for (;;) {
849                     if (isLineTerminator(ch)) {
850                         if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851                             (void)UTEXT_NEXT32(fInputText);
852                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
853                         }
854                         MatchAt(startPos, false, status);
855                         if (U_FAILURE(status)) {
856                             return false;
857                         }
858                         if (fMatch) {
859                             return true;
860                         }
861                         UTEXT_SETNATIVEINDEX(fInputText, startPos);
862                     }
863                     if (startPos >= testStartLimit) {
864                         fMatch = false;
865                         fHitEnd = true;
866                         return false;
867                     }
868                     ch = UTEXT_NEXT32(fInputText);
869                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
870                     // Note that it's perfectly OK for a pattern to have a zero-length
871                     //   match at the end of a string, so we must make sure that the loop
872                     //   runs with startPos == testStartLimit the last time through.
873                     if  (findProgressInterrupt(startPos, status))
874                         return false;
875                 }
876             }
877         }
878 
879     default:
880         UPRV_UNREACHABLE_ASSERT;
881         // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
882         // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
883         // See ICU-21669.
884         status = U_INTERNAL_PROGRAM_ERROR;
885         return false;
886     }
887 
888     UPRV_UNREACHABLE_EXIT;
889 }
890 
891 
892 
find(int64_t start,UErrorCode & status)893 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
894     if (U_FAILURE(status)) {
895         return false;
896     }
897     if (U_FAILURE(fDeferredStatus)) {
898         status = fDeferredStatus;
899         return false;
900     }
901     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
902                                           //        This will reset the region to be the full input length.
903     if (start < 0) {
904         status = U_INDEX_OUTOFBOUNDS_ERROR;
905         return false;
906     }
907 
908     int64_t nativeStart = start;
909     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
910         status = U_INDEX_OUTOFBOUNDS_ERROR;
911         return false;
912     }
913     fMatchEnd = nativeStart;
914     return find(status);
915 }
916 
917 
918 //--------------------------------------------------------------------------------
919 //
920 //   findUsingChunk() -- like find(), but with the advance knowledge that the
921 //                       entire string is available in the UText's chunk buffer.
922 //
923 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)924 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
925     // Start at the position of the last match end.  (Will be zero if the
926     //   matcher has been reset.
927     //
928 
929     int32_t startPos = (int32_t)fMatchEnd;
930     if (startPos==0) {
931         startPos = (int32_t)fActiveStart;
932     }
933 
934     const char16_t *inputBuf = fInputText->chunkContents;
935 
936     if (fMatch) {
937         // Save the position of any previous successful match.
938         fLastMatchEnd = fMatchEnd;
939 
940         if (fMatchStart == fMatchEnd) {
941             // Previous match had zero length.  Move start position up one position
942             //  to avoid sending find() into a loop on zero-length matches.
943             if (startPos >= fActiveLimit) {
944                 fMatch = false;
945                 fHitEnd = true;
946                 return false;
947             }
948             U16_FWD_1(inputBuf, startPos, fInputLength);
949         }
950     } else {
951         if (fLastMatchEnd >= 0) {
952             // A previous find() failed to match.  Don't try again.
953             //   (without this test, a pattern with a zero-length match
954             //    could match again at the end of an input string.)
955             fHitEnd = true;
956             return false;
957         }
958     }
959 
960 
961     // Compute the position in the input string beyond which a match can not begin, because
962     //   the minimum length match would extend past the end of the input.
963     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
964     //          Be aware of possible overflows if making changes here.
965     //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
966     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
967     if (startPos > testLen) {
968         fMatch = false;
969         fHitEnd = true;
970         return false;
971     }
972 
973     UChar32  c;
974     U_ASSERT(startPos >= 0);
975 
976     switch (fPattern->fStartType) {
977     case START_NO_INFO:
978         // No optimization was found.
979         //  Try a match at each input position.
980         for (;;) {
981             MatchChunkAt(startPos, false, status);
982             if (U_FAILURE(status)) {
983                 return false;
984             }
985             if (fMatch) {
986                 return true;
987             }
988             if (startPos >= testLen) {
989                 fHitEnd = true;
990                 return false;
991             }
992             U16_FWD_1(inputBuf, startPos, fActiveLimit);
993             // Note that it's perfectly OK for a pattern to have a zero-length
994             //   match at the end of a string, so we must make sure that the loop
995             //   runs with startPos == testLen the last time through.
996             if  (findProgressInterrupt(startPos, status))
997                 return false;
998         }
999         UPRV_UNREACHABLE_EXIT;
1000 
1001     case START_START:
1002         // Matches are only possible at the start of the input string
1003         //   (pattern begins with ^ or \A)
1004         if (startPos > fActiveStart) {
1005             fMatch = false;
1006             return false;
1007         }
1008         MatchChunkAt(startPos, false, status);
1009         if (U_FAILURE(status)) {
1010             return false;
1011         }
1012         return fMatch;
1013 
1014 
1015     case START_SET:
1016     {
1017         // Match may start on any char from a pre-computed set.
1018         U_ASSERT(fPattern->fMinMatchLen > 0);
1019         for (;;) {
1020             int32_t pos = startPos;
1021             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1022             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1023                 (c>=256 && fPattern->fInitialChars->contains(c))) {
1024                 MatchChunkAt(pos, false, status);
1025                 if (U_FAILURE(status)) {
1026                     return false;
1027                 }
1028                 if (fMatch) {
1029                     return true;
1030                 }
1031             }
1032             if (startPos > testLen) {
1033                 fMatch = false;
1034                 fHitEnd = true;
1035                 return false;
1036             }
1037             if  (findProgressInterrupt(startPos, status))
1038                 return false;
1039         }
1040     }
1041     UPRV_UNREACHABLE_EXIT;
1042 
1043     case START_STRING:
1044     case START_CHAR:
1045     {
1046         // Match starts on exactly one char.
1047         U_ASSERT(fPattern->fMinMatchLen > 0);
1048         UChar32 theChar = fPattern->fInitialChar;
1049         for (;;) {
1050             int32_t pos = startPos;
1051             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1052             if (c == theChar) {
1053                 MatchChunkAt(pos, false, status);
1054                 if (U_FAILURE(status)) {
1055                     return false;
1056                 }
1057                 if (fMatch) {
1058                     return true;
1059                 }
1060             }
1061             if (startPos > testLen) {
1062                 fMatch = false;
1063                 fHitEnd = true;
1064                 return false;
1065             }
1066             if  (findProgressInterrupt(startPos, status))
1067                 return false;
1068         }
1069     }
1070     UPRV_UNREACHABLE_EXIT;
1071 
1072     case START_LINE:
1073     {
1074         UChar32 ch;
1075         if (startPos == fAnchorStart) {
1076             MatchChunkAt(startPos, false, status);
1077             if (U_FAILURE(status)) {
1078                 return false;
1079             }
1080             if (fMatch) {
1081                 return true;
1082             }
1083             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1084         }
1085 
1086         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1087             for (;;) {
1088                 ch = inputBuf[startPos-1];
1089                 if (ch == 0x0a) {
1090                     MatchChunkAt(startPos, false, status);
1091                     if (U_FAILURE(status)) {
1092                         return false;
1093                     }
1094                     if (fMatch) {
1095                         return true;
1096                     }
1097                 }
1098                 if (startPos >= testLen) {
1099                     fMatch = false;
1100                     fHitEnd = true;
1101                     return false;
1102                 }
1103                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1104                 // Note that it's perfectly OK for a pattern to have a zero-length
1105                 //   match at the end of a string, so we must make sure that the loop
1106                 //   runs with startPos == testLen the last time through.
1107                 if  (findProgressInterrupt(startPos, status))
1108                     return false;
1109             }
1110         } else {
1111             for (;;) {
1112                 ch = inputBuf[startPos-1];
1113                 if (isLineTerminator(ch)) {
1114                     if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1115                         startPos++;
1116                     }
1117                     MatchChunkAt(startPos, false, status);
1118                     if (U_FAILURE(status)) {
1119                         return false;
1120                     }
1121                     if (fMatch) {
1122                         return true;
1123                     }
1124                 }
1125                 if (startPos >= testLen) {
1126                     fMatch = false;
1127                     fHitEnd = true;
1128                     return false;
1129                 }
1130                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1131                 // Note that it's perfectly OK for a pattern to have a zero-length
1132                 //   match at the end of a string, so we must make sure that the loop
1133                 //   runs with startPos == testLen the last time through.
1134                 if  (findProgressInterrupt(startPos, status))
1135                     return false;
1136             }
1137         }
1138     }
1139 
1140     default:
1141         UPRV_UNREACHABLE_ASSERT;
1142         // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
1143         // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
1144         // See ICU-21669.
1145         status = U_INTERNAL_PROGRAM_ERROR;
1146         return false;
1147     }
1148 
1149     UPRV_UNREACHABLE_EXIT;
1150 }
1151 
1152 
1153 
1154 //--------------------------------------------------------------------------------
1155 //
1156 //  group()
1157 //
1158 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1159 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1160     return group(0, status);
1161 }
1162 
1163 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1164 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1165     return group(0, dest, group_len, status);
1166 }
1167 
1168 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1169 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1170     group_len = 0;
1171     if (U_FAILURE(status)) {
1172         return dest;
1173     }
1174     if (U_FAILURE(fDeferredStatus)) {
1175         status = fDeferredStatus;
1176     } else if (fMatch == false) {
1177         status = U_REGEX_INVALID_STATE;
1178     } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1179         status = U_INDEX_OUTOFBOUNDS_ERROR;
1180     }
1181 
1182     if (U_FAILURE(status)) {
1183         return dest;
1184     }
1185 
1186     int64_t s, e;
1187     if (groupNum == 0) {
1188         s = fMatchStart;
1189         e = fMatchEnd;
1190     } else {
1191         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1192         U_ASSERT(groupOffset < fPattern->fFrameSize);
1193         U_ASSERT(groupOffset >= 0);
1194         s = fFrame->fExtra[groupOffset];
1195         e = fFrame->fExtra[groupOffset+1];
1196     }
1197 
1198     if (s < 0) {
1199         // A capture group wasn't part of the match
1200         return utext_clone(dest, fInputText, false, true, &status);
1201     }
1202     U_ASSERT(s <= e);
1203     group_len = e - s;
1204 
1205     dest = utext_clone(dest, fInputText, false, true, &status);
1206     if (dest)
1207         UTEXT_SETNATIVEINDEX(dest, s);
1208     return dest;
1209 }
1210 
group(int32_t groupNum,UErrorCode & status) const1211 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1212     UnicodeString result;
1213     int64_t groupStart = start64(groupNum, status);
1214     int64_t groupEnd = end64(groupNum, status);
1215     if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1216         return result;
1217     }
1218 
1219     // Get the group length using a utext_extract preflight.
1220     //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
1221     int32_t length = utext_extract(fInputText, groupStart, groupEnd, nullptr, 0, &status);
1222     if (status != U_BUFFER_OVERFLOW_ERROR) {
1223         return result;
1224     }
1225 
1226     status = U_ZERO_ERROR;
1227     char16_t *buf = result.getBuffer(length);
1228     if (buf == nullptr) {
1229         status = U_MEMORY_ALLOCATION_ERROR;
1230     } else {
1231         int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1232         result.releaseBuffer(extractLength);
1233         U_ASSERT(length == extractLength);
1234     }
1235     return result;
1236 }
1237 
1238 
1239 //--------------------------------------------------------------------------------
1240 //
1241 //  appendGroup() -- currently internal only, appends a group to a UText rather
1242 //                   than replacing its contents
1243 //
1244 //--------------------------------------------------------------------------------
1245 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1246 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1247     if (U_FAILURE(status)) {
1248         return 0;
1249     }
1250     if (U_FAILURE(fDeferredStatus)) {
1251         status = fDeferredStatus;
1252         return 0;
1253     }
1254     int64_t destLen = utext_nativeLength(dest);
1255 
1256     if (fMatch == false) {
1257         status = U_REGEX_INVALID_STATE;
1258         return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
1259     }
1260     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1261         status = U_INDEX_OUTOFBOUNDS_ERROR;
1262         return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
1263     }
1264 
1265     int64_t s, e;
1266     if (groupNum == 0) {
1267         s = fMatchStart;
1268         e = fMatchEnd;
1269     } else {
1270         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1271         U_ASSERT(groupOffset < fPattern->fFrameSize);
1272         U_ASSERT(groupOffset >= 0);
1273         s = fFrame->fExtra[groupOffset];
1274         e = fFrame->fExtra[groupOffset+1];
1275     }
1276 
1277     if (s < 0) {
1278         // A capture group wasn't part of the match
1279         return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
1280     }
1281     U_ASSERT(s <= e);
1282 
1283     int64_t deltaLen;
1284     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1285         U_ASSERT(e <= fInputLength);
1286         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1287     } else {
1288         int32_t len16;
1289         if (UTEXT_USES_U16(fInputText)) {
1290             len16 = (int32_t)(e-s);
1291         } else {
1292             UErrorCode lengthStatus = U_ZERO_ERROR;
1293             len16 = utext_extract(fInputText, s, e, nullptr, 0, &lengthStatus);
1294         }
1295         char16_t *groupChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(len16+1));
1296         if (groupChars == nullptr) {
1297             status = U_MEMORY_ALLOCATION_ERROR;
1298             return 0;
1299         }
1300         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1301 
1302         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1303         uprv_free(groupChars);
1304     }
1305     return deltaLen;
1306 }
1307 
1308 
1309 
1310 //--------------------------------------------------------------------------------
1311 //
1312 //  groupCount()
1313 //
1314 //--------------------------------------------------------------------------------
groupCount() const1315 int32_t RegexMatcher::groupCount() const {
1316     return fPattern->fGroupMap->size();
1317 }
1318 
1319 //--------------------------------------------------------------------------------
1320 //
1321 //  hasAnchoringBounds()
1322 //
1323 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1324 UBool RegexMatcher::hasAnchoringBounds() const {
1325     return fAnchoringBounds;
1326 }
1327 
1328 
1329 //--------------------------------------------------------------------------------
1330 //
1331 //  hasTransparentBounds()
1332 //
1333 //--------------------------------------------------------------------------------
hasTransparentBounds() const1334 UBool RegexMatcher::hasTransparentBounds() const {
1335     return fTransparentBounds;
1336 }
1337 
1338 
1339 
1340 //--------------------------------------------------------------------------------
1341 //
1342 //  hitEnd()
1343 //
1344 //--------------------------------------------------------------------------------
hitEnd() const1345 UBool RegexMatcher::hitEnd() const {
1346     return fHitEnd;
1347 }
1348 
1349 
1350 //--------------------------------------------------------------------------------
1351 //
1352 //  input()
1353 //
1354 //--------------------------------------------------------------------------------
input() const1355 const UnicodeString &RegexMatcher::input() const {
1356     if (!fInput) {
1357         UErrorCode status = U_ZERO_ERROR;
1358         int32_t len16;
1359         if (UTEXT_USES_U16(fInputText)) {
1360             len16 = (int32_t)fInputLength;
1361         } else {
1362             len16 = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &status);
1363             status = U_ZERO_ERROR; // overflow, length status
1364         }
1365         UnicodeString *result = new UnicodeString(len16, 0, 0);
1366 
1367         char16_t *inputChars = result->getBuffer(len16);
1368         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1369         result->releaseBuffer(len16);
1370 
1371         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1372     }
1373 
1374     return *fInput;
1375 }
1376 
1377 //--------------------------------------------------------------------------------
1378 //
1379 //  inputText()
1380 //
1381 //--------------------------------------------------------------------------------
inputText() const1382 UText *RegexMatcher::inputText() const {
1383     return fInputText;
1384 }
1385 
1386 
1387 //--------------------------------------------------------------------------------
1388 //
1389 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1390 //
1391 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1392 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1393     if (U_FAILURE(status)) {
1394         return dest;
1395     }
1396     if (U_FAILURE(fDeferredStatus)) {
1397         status = fDeferredStatus;
1398         return dest;
1399     }
1400 
1401     if (dest) {
1402         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1403             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1404         } else {
1405             int32_t input16Len;
1406             if (UTEXT_USES_U16(fInputText)) {
1407                 input16Len = (int32_t)fInputLength;
1408             } else {
1409                 UErrorCode lengthStatus = U_ZERO_ERROR;
1410                 input16Len = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &lengthStatus); // buffer overflow error
1411             }
1412             char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(input16Len));
1413             if (inputChars == nullptr) {
1414                 return dest;
1415             }
1416 
1417             status = U_ZERO_ERROR;
1418             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1419             status = U_ZERO_ERROR;
1420             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1421 
1422             uprv_free(inputChars);
1423         }
1424         return dest;
1425     } else {
1426         return utext_clone(nullptr, fInputText, false, true, &status);
1427     }
1428 }
1429 
1430 
1431 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1432 static UBool compat_SyncMutableUTextContents(UText *ut) {
1433     UBool retVal = false;
1434 
1435     //  In the following test, we're really only interested in whether the UText should switch
1436     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1437     //  will still point to the correct data.
1438     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1439         UnicodeString *us=(UnicodeString *)ut->context;
1440 
1441         // Update to the latest length.
1442         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1443         int32_t newLength = us->length();
1444 
1445         // Update the chunk description.
1446         // The buffer may have switched between stack- and heap-based.
1447         ut->chunkContents    = us->getBuffer();
1448         ut->chunkLength      = newLength;
1449         ut->chunkNativeLimit = newLength;
1450         ut->nativeIndexingLimit = newLength;
1451         retVal = true;
1452     }
1453 
1454     return retVal;
1455 }
1456 
1457 //--------------------------------------------------------------------------------
1458 //
1459 //  lookingAt()
1460 //
1461 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1462 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1463     if (U_FAILURE(status)) {
1464         return false;
1465     }
1466     if (U_FAILURE(fDeferredStatus)) {
1467         status = fDeferredStatus;
1468         return false;
1469     }
1470 
1471     if (fInputUniStrMaybeMutable) {
1472         if (compat_SyncMutableUTextContents(fInputText)) {
1473         fInputLength = utext_nativeLength(fInputText);
1474         reset();
1475         }
1476     }
1477     else {
1478         resetPreserveRegion();
1479     }
1480     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1481         MatchChunkAt((int32_t)fActiveStart, false, status);
1482     } else {
1483         MatchAt(fActiveStart, false, status);
1484     }
1485     return fMatch;
1486 }
1487 
1488 
lookingAt(int64_t start,UErrorCode & status)1489 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1490     if (U_FAILURE(status)) {
1491         return false;
1492     }
1493     if (U_FAILURE(fDeferredStatus)) {
1494         status = fDeferredStatus;
1495         return false;
1496     }
1497     reset();
1498 
1499     if (start < 0) {
1500         status = U_INDEX_OUTOFBOUNDS_ERROR;
1501         return false;
1502     }
1503 
1504     if (fInputUniStrMaybeMutable) {
1505         if (compat_SyncMutableUTextContents(fInputText)) {
1506         fInputLength = utext_nativeLength(fInputText);
1507         reset();
1508         }
1509     }
1510 
1511     int64_t nativeStart;
1512     nativeStart = start;
1513     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1514         status = U_INDEX_OUTOFBOUNDS_ERROR;
1515         return false;
1516     }
1517 
1518     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1519         MatchChunkAt((int32_t)nativeStart, false, status);
1520     } else {
1521         MatchAt(nativeStart, false, status);
1522     }
1523     return fMatch;
1524 }
1525 
1526 
1527 
1528 //--------------------------------------------------------------------------------
1529 //
1530 //  matches()
1531 //
1532 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1533 UBool RegexMatcher::matches(UErrorCode &status) {
1534     if (U_FAILURE(status)) {
1535         return false;
1536     }
1537     if (U_FAILURE(fDeferredStatus)) {
1538         status = fDeferredStatus;
1539         return false;
1540     }
1541 
1542     if (fInputUniStrMaybeMutable) {
1543         if (compat_SyncMutableUTextContents(fInputText)) {
1544         fInputLength = utext_nativeLength(fInputText);
1545         reset();
1546         }
1547     }
1548     else {
1549         resetPreserveRegion();
1550     }
1551 
1552     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1553         MatchChunkAt((int32_t)fActiveStart, true, status);
1554     } else {
1555         MatchAt(fActiveStart, true, status);
1556     }
1557     return fMatch;
1558 }
1559 
1560 
matches(int64_t start,UErrorCode & status)1561 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1562     if (U_FAILURE(status)) {
1563         return false;
1564     }
1565     if (U_FAILURE(fDeferredStatus)) {
1566         status = fDeferredStatus;
1567         return false;
1568     }
1569     reset();
1570 
1571     if (start < 0) {
1572         status = U_INDEX_OUTOFBOUNDS_ERROR;
1573         return false;
1574     }
1575 
1576     if (fInputUniStrMaybeMutable) {
1577         if (compat_SyncMutableUTextContents(fInputText)) {
1578         fInputLength = utext_nativeLength(fInputText);
1579         reset();
1580         }
1581     }
1582 
1583     int64_t nativeStart;
1584     nativeStart = start;
1585     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1586         status = U_INDEX_OUTOFBOUNDS_ERROR;
1587         return false;
1588     }
1589 
1590     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1591         MatchChunkAt((int32_t)nativeStart, true, status);
1592     } else {
1593         MatchAt(nativeStart, true, status);
1594     }
1595     return fMatch;
1596 }
1597 
1598 
1599 
1600 //--------------------------------------------------------------------------------
1601 //
1602 //    pattern
1603 //
1604 //--------------------------------------------------------------------------------
pattern() const1605 const RegexPattern &RegexMatcher::pattern() const {
1606     return *fPattern;
1607 }
1608 
1609 
1610 
1611 //--------------------------------------------------------------------------------
1612 //
1613 //    region
1614 //
1615 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1616 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1617     if (U_FAILURE(status)) {
1618         return *this;
1619     }
1620 
1621     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1622         status = U_ILLEGAL_ARGUMENT_ERROR;
1623     }
1624 
1625     int64_t nativeStart = regionStart;
1626     int64_t nativeLimit = regionLimit;
1627     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1628       status = U_ILLEGAL_ARGUMENT_ERROR;
1629     }
1630 
1631     if (startIndex == -1)
1632       this->reset();
1633     else
1634       resetPreserveRegion();
1635 
1636     fRegionStart = nativeStart;
1637     fRegionLimit = nativeLimit;
1638     fActiveStart = nativeStart;
1639     fActiveLimit = nativeLimit;
1640 
1641     if (startIndex != -1) {
1642       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1643           status = U_INDEX_OUTOFBOUNDS_ERROR;
1644       }
1645       fMatchEnd = startIndex;
1646     }
1647 
1648     if (!fTransparentBounds) {
1649         fLookStart = nativeStart;
1650         fLookLimit = nativeLimit;
1651     }
1652     if (fAnchoringBounds) {
1653         fAnchorStart = nativeStart;
1654         fAnchorLimit = nativeLimit;
1655     }
1656     return *this;
1657 }
1658 
region(int64_t start,int64_t limit,UErrorCode & status)1659 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1660   return region(start, limit, -1, status);
1661 }
1662 
1663 //--------------------------------------------------------------------------------
1664 //
1665 //    regionEnd
1666 //
1667 //--------------------------------------------------------------------------------
regionEnd() const1668 int32_t RegexMatcher::regionEnd() const {
1669     return (int32_t)fRegionLimit;
1670 }
1671 
regionEnd64() const1672 int64_t RegexMatcher::regionEnd64() const {
1673     return fRegionLimit;
1674 }
1675 
1676 //--------------------------------------------------------------------------------
1677 //
1678 //    regionStart
1679 //
1680 //--------------------------------------------------------------------------------
regionStart() const1681 int32_t RegexMatcher::regionStart() const {
1682     return (int32_t)fRegionStart;
1683 }
1684 
regionStart64() const1685 int64_t RegexMatcher::regionStart64() const {
1686     return fRegionStart;
1687 }
1688 
1689 
1690 //--------------------------------------------------------------------------------
1691 //
1692 //    replaceAll
1693 //
1694 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1695 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1696     UText replacementText = UTEXT_INITIALIZER;
1697     UText resultText = UTEXT_INITIALIZER;
1698     UnicodeString resultString;
1699     if (U_FAILURE(status)) {
1700         return resultString;
1701     }
1702 
1703     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1704     utext_openUnicodeString(&resultText, &resultString, &status);
1705 
1706     replaceAll(&replacementText, &resultText, status);
1707 
1708     utext_close(&resultText);
1709     utext_close(&replacementText);
1710 
1711     return resultString;
1712 }
1713 
1714 
1715 //
1716 //    replaceAll, UText mode
1717 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1718 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1719     if (U_FAILURE(status)) {
1720         return dest;
1721     }
1722     if (U_FAILURE(fDeferredStatus)) {
1723         status = fDeferredStatus;
1724         return dest;
1725     }
1726 
1727     if (dest == nullptr) {
1728         UnicodeString emptyString;
1729         UText empty = UTEXT_INITIALIZER;
1730 
1731         utext_openUnicodeString(&empty, &emptyString, &status);
1732         dest = utext_clone(nullptr, &empty, true, false, &status);
1733         utext_close(&empty);
1734     }
1735 
1736     if (U_SUCCESS(status)) {
1737         reset();
1738         while (find()) {
1739             appendReplacement(dest, replacement, status);
1740             if (U_FAILURE(status)) {
1741                 break;
1742             }
1743         }
1744         appendTail(dest, status);
1745     }
1746 
1747     return dest;
1748 }
1749 
1750 
1751 //--------------------------------------------------------------------------------
1752 //
1753 //    replaceFirst
1754 //
1755 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1756 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1757     UText replacementText = UTEXT_INITIALIZER;
1758     UText resultText = UTEXT_INITIALIZER;
1759     UnicodeString resultString;
1760 
1761     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1762     utext_openUnicodeString(&resultText, &resultString, &status);
1763 
1764     replaceFirst(&replacementText, &resultText, status);
1765 
1766     utext_close(&resultText);
1767     utext_close(&replacementText);
1768 
1769     return resultString;
1770 }
1771 
1772 //
1773 //    replaceFirst, UText mode
1774 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1775 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1776     if (U_FAILURE(status)) {
1777         return dest;
1778     }
1779     if (U_FAILURE(fDeferredStatus)) {
1780         status = fDeferredStatus;
1781         return dest;
1782     }
1783 
1784     reset();
1785     if (!find()) {
1786         return getInput(dest, status);
1787     }
1788 
1789     if (dest == nullptr) {
1790         UnicodeString emptyString;
1791         UText empty = UTEXT_INITIALIZER;
1792 
1793         utext_openUnicodeString(&empty, &emptyString, &status);
1794         dest = utext_clone(nullptr, &empty, true, false, &status);
1795         utext_close(&empty);
1796     }
1797 
1798     appendReplacement(dest, replacement, status);
1799     appendTail(dest, status);
1800 
1801     return dest;
1802 }
1803 
1804 
1805 //--------------------------------------------------------------------------------
1806 //
1807 //     requireEnd
1808 //
1809 //--------------------------------------------------------------------------------
requireEnd() const1810 UBool RegexMatcher::requireEnd() const {
1811     return fRequireEnd;
1812 }
1813 
1814 
1815 //--------------------------------------------------------------------------------
1816 //
1817 //     reset
1818 //
1819 //--------------------------------------------------------------------------------
reset()1820 RegexMatcher &RegexMatcher::reset() {
1821     fRegionStart    = 0;
1822     fRegionLimit    = fInputLength;
1823     fActiveStart    = 0;
1824     fActiveLimit    = fInputLength;
1825     fAnchorStart    = 0;
1826     fAnchorLimit    = fInputLength;
1827     fLookStart      = 0;
1828     fLookLimit      = fInputLength;
1829     resetPreserveRegion();
1830     return *this;
1831 }
1832 
1833 
1834 
resetPreserveRegion()1835 void RegexMatcher::resetPreserveRegion() {
1836     fMatchStart     = 0;
1837     fMatchEnd       = 0;
1838     fLastMatchEnd   = -1;
1839     fAppendPosition = 0;
1840     fMatch          = false;
1841     fHitEnd         = false;
1842     fRequireEnd     = false;
1843     fTime           = 0;
1844     fTickCounter    = TIMER_INITIAL_VALUE;
1845     //resetStack(); // more expensive than it looks...
1846 }
1847 
1848 
reset(const UnicodeString & input)1849 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1850     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1851     if (fPattern->fNeedsAltInput) {
1852         fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
1853     }
1854     if (U_FAILURE(fDeferredStatus)) {
1855         return *this;
1856     }
1857     fInputLength = utext_nativeLength(fInputText);
1858 
1859     reset();
1860     delete fInput;
1861     fInput = nullptr;
1862 
1863     //  Do the following for any UnicodeString.
1864     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1865     fInputUniStrMaybeMutable = true;
1866 
1867 #if UCONFIG_NO_BREAK_ITERATION==0
1868     if (fWordBreakItr) {
1869         fWordBreakItr->setText(fInputText, fDeferredStatus);
1870     }
1871     if (fGCBreakItr) {
1872         fGCBreakItr->setText(fInputText, fDeferredStatus);
1873     }
1874 #endif
1875 
1876     return *this;
1877 }
1878 
1879 
reset(UText * input)1880 RegexMatcher &RegexMatcher::reset(UText *input) {
1881     if (fInputText != input) {
1882         fInputText = utext_clone(fInputText, input, false, true, &fDeferredStatus);
1883         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
1884         if (U_FAILURE(fDeferredStatus)) {
1885             return *this;
1886         }
1887         fInputLength = utext_nativeLength(fInputText);
1888 
1889         delete fInput;
1890         fInput = nullptr;
1891 
1892 #if UCONFIG_NO_BREAK_ITERATION==0
1893         if (fWordBreakItr) {
1894             fWordBreakItr->setText(input, fDeferredStatus);
1895         }
1896         if (fGCBreakItr) {
1897             fGCBreakItr->setText(fInputText, fDeferredStatus);
1898         }
1899 #endif
1900     }
1901     reset();
1902     fInputUniStrMaybeMutable = false;
1903 
1904     return *this;
1905 }
1906 
1907 /*RegexMatcher &RegexMatcher::reset(const char16_t *) {
1908     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1909     return *this;
1910 }*/
1911 
reset(int64_t position,UErrorCode & status)1912 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1913     if (U_FAILURE(status)) {
1914         return *this;
1915     }
1916     reset();       // Reset also resets the region to be the entire string.
1917 
1918     if (position < 0 || position > fActiveLimit) {
1919         status = U_INDEX_OUTOFBOUNDS_ERROR;
1920         return *this;
1921     }
1922     fMatchEnd = position;
1923     return *this;
1924 }
1925 
1926 
1927 //--------------------------------------------------------------------------------
1928 //
1929 //    refresh
1930 //
1931 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1932 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1933     if (U_FAILURE(status)) {
1934         return *this;
1935     }
1936     if (input == nullptr) {
1937         status = U_ILLEGAL_ARGUMENT_ERROR;
1938         return *this;
1939     }
1940     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1941         status = U_ILLEGAL_ARGUMENT_ERROR;
1942         return *this;
1943     }
1944     int64_t  pos = utext_getNativeIndex(fInputText);
1945     //  Shallow read-only clone of the new UText into the existing input UText
1946     fInputText = utext_clone(fInputText, input, false, true, &status);
1947     if (U_FAILURE(status)) {
1948         return *this;
1949     }
1950     utext_setNativeIndex(fInputText, pos);
1951 
1952     if (fAltInputText != nullptr) {
1953         pos = utext_getNativeIndex(fAltInputText);
1954         fAltInputText = utext_clone(fAltInputText, input, false, true, &status);
1955         if (U_FAILURE(status)) {
1956             return *this;
1957         }
1958         utext_setNativeIndex(fAltInputText, pos);
1959     }
1960     return *this;
1961 }
1962 
1963 
1964 
1965 //--------------------------------------------------------------------------------
1966 //
1967 //    setTrace
1968 //
1969 //--------------------------------------------------------------------------------
setTrace(UBool state)1970 void RegexMatcher::setTrace(UBool state) {
1971     fTraceDebug = state;
1972 }
1973 
1974 
1975 
1976 /**
1977   *  UText, replace entire contents of the destination UText with a substring of the source UText.
1978   *
1979   *     @param src    The source UText
1980   *     @param dest   The destination UText. Must be writable.
1981   *                   May be nullptr, in which case a new UText will be allocated.
1982   *     @param start  Start index of source substring.
1983   *     @param limit  Limit index of source substring.
1984   *     @param status An error code.
1985   */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1986 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1987     if (U_FAILURE(*status)) {
1988         return dest;
1989     }
1990     if (start == limit) {
1991         if (dest) {
1992             utext_replace(dest, 0, utext_nativeLength(dest), nullptr, 0, status);
1993             return dest;
1994         } else {
1995             return utext_openUChars(nullptr, nullptr, 0, status);
1996         }
1997     }
1998     int32_t length = utext_extract(src, start, limit, nullptr, 0, status);
1999     if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
2000         return dest;
2001     }
2002     *status = U_ZERO_ERROR;
2003     MaybeStackArray<char16_t, 40> buffer;
2004     if (length >= buffer.getCapacity()) {
2005         char16_t *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
2006         if (newBuf == nullptr) {
2007             *status = U_MEMORY_ALLOCATION_ERROR;
2008         }
2009     }
2010     utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2011     if (dest) {
2012         utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2013         return dest;
2014     }
2015 
2016     // Caller did not provide a preexisting UText.
2017     // Open a new one, and have it adopt the text buffer storage.
2018     if (U_FAILURE(*status)) {
2019         return nullptr;
2020     }
2021     int32_t ownedLength = 0;
2022     char16_t *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2023     if (ownedBuf == nullptr) {
2024         *status = U_MEMORY_ALLOCATION_ERROR;
2025         return nullptr;
2026     }
2027     UText *result = utext_openUChars(nullptr, ownedBuf, length, status);
2028     if (U_FAILURE(*status)) {
2029         uprv_free(ownedBuf);
2030         return nullptr;
2031     }
2032     result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2033     return result;
2034 }
2035 
2036 
2037 //---------------------------------------------------------------------
2038 //
2039 //   split
2040 //
2041 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2042 int32_t  RegexMatcher::split(const UnicodeString &input,
2043         UnicodeString    dest[],
2044         int32_t          destCapacity,
2045         UErrorCode      &status)
2046 {
2047     UText inputText = UTEXT_INITIALIZER;
2048     utext_openConstUnicodeString(&inputText, &input, &status);
2049     if (U_FAILURE(status)) {
2050         return 0;
2051     }
2052 
2053     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2054     if (destText == nullptr) {
2055         status = U_MEMORY_ALLOCATION_ERROR;
2056         return 0;
2057     }
2058     int32_t i;
2059     for (i = 0; i < destCapacity; i++) {
2060         destText[i] = utext_openUnicodeString(nullptr, &dest[i], &status);
2061     }
2062 
2063     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2064 
2065     for (i = 0; i < destCapacity; i++) {
2066         utext_close(destText[i]);
2067     }
2068 
2069     uprv_free(destText);
2070     utext_close(&inputText);
2071     return fieldCount;
2072 }
2073 
2074 //
2075 //   split, UText mode
2076 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2077 int32_t  RegexMatcher::split(UText *input,
2078         UText           *dest[],
2079         int32_t          destCapacity,
2080         UErrorCode      &status)
2081 {
2082     //
2083     // Check arguments for validity
2084     //
2085     if (U_FAILURE(status)) {
2086         return 0;
2087     }
2088 
2089     if (destCapacity < 1) {
2090         status = U_ILLEGAL_ARGUMENT_ERROR;
2091         return 0;
2092     }
2093 
2094     //
2095     // Reset for the input text
2096     //
2097     reset(input);
2098     int64_t   nextOutputStringStart = 0;
2099     if (fActiveLimit == 0) {
2100         return 0;
2101     }
2102 
2103     //
2104     // Loop through the input text, searching for the delimiter pattern
2105     //
2106     int32_t i;
2107     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2108     for (i=0; ; i++) {
2109         if (i>=destCapacity-1) {
2110             // There is one or zero output string left.
2111             // Fill the last output string with whatever is left from the input, then exit the loop.
2112             //  ( i will be == destCapacity if we filled the output array while processing
2113             //    capture groups of the delimiter expression, in which case we will discard the
2114             //    last capture group saved in favor of the unprocessed remainder of the
2115             //    input string.)
2116             i = destCapacity-1;
2117             if (fActiveLimit > nextOutputStringStart) {
2118                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2119                     if (dest[i]) {
2120                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2121                                       input->chunkContents+nextOutputStringStart,
2122                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2123                     } else {
2124                         UText remainingText = UTEXT_INITIALIZER;
2125                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2126                                          fActiveLimit-nextOutputStringStart, &status);
2127                         dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2128                         utext_close(&remainingText);
2129                     }
2130                 } else {
2131                     UErrorCode lengthStatus = U_ZERO_ERROR;
2132                     int32_t remaining16Length =
2133                         utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus);
2134                     char16_t *remainingChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(remaining16Length+1));
2135                     if (remainingChars == nullptr) {
2136                         status = U_MEMORY_ALLOCATION_ERROR;
2137                         break;
2138                     }
2139 
2140                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2141                     if (dest[i]) {
2142                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2143                     } else {
2144                         UText remainingText = UTEXT_INITIALIZER;
2145                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2146                         dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2147                         utext_close(&remainingText);
2148                     }
2149 
2150                     uprv_free(remainingChars);
2151                 }
2152             }
2153             break;
2154         }
2155         if (find()) {
2156             // We found another delimiter.  Move everything from where we started looking
2157             //  up until the start of the delimiter into the next output string.
2158             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2159                 if (dest[i]) {
2160                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2161                                   input->chunkContents+nextOutputStringStart,
2162                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2163                 } else {
2164                     UText remainingText = UTEXT_INITIALIZER;
2165                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2166                                       fMatchStart-nextOutputStringStart, &status);
2167                     dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2168                     utext_close(&remainingText);
2169                 }
2170             } else {
2171                 UErrorCode lengthStatus = U_ZERO_ERROR;
2172                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, nullptr, 0, &lengthStatus);
2173                 char16_t *remainingChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(remaining16Length+1));
2174                 if (remainingChars == nullptr) {
2175                     status = U_MEMORY_ALLOCATION_ERROR;
2176                     break;
2177                 }
2178                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2179                 if (dest[i]) {
2180                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2181                 } else {
2182                     UText remainingText = UTEXT_INITIALIZER;
2183                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2184                     dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2185                     utext_close(&remainingText);
2186                 }
2187 
2188                 uprv_free(remainingChars);
2189             }
2190             nextOutputStringStart = fMatchEnd;
2191 
2192             // If the delimiter pattern has capturing parentheses, the captured
2193             //  text goes out into the next n destination strings.
2194             int32_t groupNum;
2195             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2196                 if (i >= destCapacity-2) {
2197                     // Never fill the last available output string with capture group text.
2198                     // It will filled with the last field, the remainder of the
2199                     //  unsplit input text.
2200                     break;
2201                 }
2202                 i++;
2203                 dest[i] = utext_extract_replace(fInputText, dest[i],
2204                                                start64(groupNum, status), end64(groupNum, status), &status);
2205             }
2206 
2207             if (nextOutputStringStart == fActiveLimit) {
2208                 // The delimiter was at the end of the string.  We're done, but first
2209                 // we output one last empty string, for the empty field following
2210                 //   the delimiter at the end of input.
2211                 if (i+1 < destCapacity) {
2212                     ++i;
2213                     if (dest[i] == nullptr) {
2214                         dest[i] = utext_openUChars(nullptr, nullptr, 0, &status);
2215                     } else {
2216                         static const char16_t emptyString[] = {(char16_t)0};
2217                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2218                     }
2219                 }
2220                 break;
2221 
2222             }
2223         }
2224         else
2225         {
2226             // We ran off the end of the input while looking for the next delimiter.
2227             // All the remaining text goes into the current output string.
2228             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2229                 if (dest[i]) {
2230                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2231                                   input->chunkContents+nextOutputStringStart,
2232                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2233                 } else {
2234                     UText remainingText = UTEXT_INITIALIZER;
2235                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2236                                      fActiveLimit-nextOutputStringStart, &status);
2237                     dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2238                     utext_close(&remainingText);
2239                 }
2240             } else {
2241                 UErrorCode lengthStatus = U_ZERO_ERROR;
2242                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus);
2243                 char16_t *remainingChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(remaining16Length+1));
2244                 if (remainingChars == nullptr) {
2245                     status = U_MEMORY_ALLOCATION_ERROR;
2246                     break;
2247                 }
2248 
2249                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2250                 if (dest[i]) {
2251                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2252                 } else {
2253                     UText remainingText = UTEXT_INITIALIZER;
2254                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2255                     dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2256                     utext_close(&remainingText);
2257                 }
2258 
2259                 uprv_free(remainingChars);
2260             }
2261             break;
2262         }
2263         if (U_FAILURE(status)) {
2264             break;
2265         }
2266     }   // end of for loop
2267     return i+1;
2268 }
2269 
2270 
2271 //--------------------------------------------------------------------------------
2272 //
2273 //     start
2274 //
2275 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2276 int32_t RegexMatcher::start(UErrorCode &status) const {
2277     return start(0, status);
2278 }
2279 
start64(UErrorCode & status) const2280 int64_t RegexMatcher::start64(UErrorCode &status) const {
2281     return start64(0, status);
2282 }
2283 
2284 //--------------------------------------------------------------------------------
2285 //
2286 //     start(int32_t group, UErrorCode &status)
2287 //
2288 //--------------------------------------------------------------------------------
2289 
start64(int32_t group,UErrorCode & status) const2290 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2291     if (U_FAILURE(status)) {
2292         return -1;
2293     }
2294     if (U_FAILURE(fDeferredStatus)) {
2295         status = fDeferredStatus;
2296         return -1;
2297     }
2298     if (fMatch == false) {
2299         status = U_REGEX_INVALID_STATE;
2300         return -1;
2301     }
2302     if (group < 0 || group > fPattern->fGroupMap->size()) {
2303         status = U_INDEX_OUTOFBOUNDS_ERROR;
2304         return -1;
2305     }
2306     int64_t s;
2307     if (group == 0) {
2308         s = fMatchStart;
2309     } else {
2310         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2311         U_ASSERT(groupOffset < fPattern->fFrameSize);
2312         U_ASSERT(groupOffset >= 0);
2313         s = fFrame->fExtra[groupOffset];
2314     }
2315 
2316     return s;
2317 }
2318 
2319 
start(int32_t group,UErrorCode & status) const2320 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2321     return (int32_t)start64(group, status);
2322 }
2323 
2324 //--------------------------------------------------------------------------------
2325 //
2326 //     useAnchoringBounds
2327 //
2328 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2329 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2330     fAnchoringBounds = b;
2331     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2332     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2333     return *this;
2334 }
2335 
2336 
2337 //--------------------------------------------------------------------------------
2338 //
2339 //     useTransparentBounds
2340 //
2341 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2342 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2343     fTransparentBounds = b;
2344     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2345     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2346     return *this;
2347 }
2348 
2349 //--------------------------------------------------------------------------------
2350 //
2351 //     setTimeLimit
2352 //
2353 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2354 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2355     if (U_FAILURE(status)) {
2356         return;
2357     }
2358     if (U_FAILURE(fDeferredStatus)) {
2359         status = fDeferredStatus;
2360         return;
2361     }
2362     if (limit < 0) {
2363         status = U_ILLEGAL_ARGUMENT_ERROR;
2364         return;
2365     }
2366     fTimeLimit = limit;
2367 }
2368 
2369 
2370 //--------------------------------------------------------------------------------
2371 //
2372 //     getTimeLimit
2373 //
2374 //--------------------------------------------------------------------------------
getTimeLimit() const2375 int32_t RegexMatcher::getTimeLimit() const {
2376     return fTimeLimit;
2377 }
2378 
2379 
2380 //--------------------------------------------------------------------------------
2381 //
2382 //     setStackLimit
2383 //
2384 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2385 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2386     if (U_FAILURE(status)) {
2387         return;
2388     }
2389     if (U_FAILURE(fDeferredStatus)) {
2390         status = fDeferredStatus;
2391         return;
2392     }
2393     if (limit < 0) {
2394         status = U_ILLEGAL_ARGUMENT_ERROR;
2395         return;
2396     }
2397 
2398     // Reset the matcher.  This is needed here in case there is a current match
2399     //    whose final stack frame (containing the match results, pointed to by fFrame)
2400     //    would be lost by resizing to a smaller stack size.
2401     reset();
2402 
2403     if (limit == 0) {
2404         // Unlimited stack expansion
2405         fStack->setMaxCapacity(0);
2406     } else {
2407         // Change the units of the limit  from bytes to ints, and bump the size up
2408         //   to be big enough to hold at least one stack frame for the pattern,
2409         //   if it isn't there already.
2410         int32_t adjustedLimit = limit / sizeof(int32_t);
2411         if (adjustedLimit < fPattern->fFrameSize) {
2412             adjustedLimit = fPattern->fFrameSize;
2413         }
2414         fStack->setMaxCapacity(adjustedLimit);
2415     }
2416     fStackLimit = limit;
2417 }
2418 
2419 
2420 //--------------------------------------------------------------------------------
2421 //
2422 //     getStackLimit
2423 //
2424 //--------------------------------------------------------------------------------
getStackLimit() const2425 int32_t RegexMatcher::getStackLimit() const {
2426     return fStackLimit;
2427 }
2428 
2429 
2430 //--------------------------------------------------------------------------------
2431 //
2432 //     setMatchCallback
2433 //
2434 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2435 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2436                                     const void              *context,
2437                                     UErrorCode              &status) {
2438     if (U_FAILURE(status)) {
2439         return;
2440     }
2441     fCallbackFn = callback;
2442     fCallbackContext = context;
2443 }
2444 
2445 
2446 //--------------------------------------------------------------------------------
2447 //
2448 //     getMatchCallback
2449 //
2450 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2451 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2452                                   const void              *&context,
2453                                   UErrorCode              &status) {
2454     if (U_FAILURE(status)) {
2455        return;
2456     }
2457     callback = fCallbackFn;
2458     context  = fCallbackContext;
2459 }
2460 
2461 
2462 //--------------------------------------------------------------------------------
2463 //
2464 //     setMatchCallback
2465 //
2466 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2467 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2468                                                 const void                      *context,
2469                                                 UErrorCode                      &status) {
2470     if (U_FAILURE(status)) {
2471         return;
2472     }
2473     fFindProgressCallbackFn = callback;
2474     fFindProgressCallbackContext = context;
2475 }
2476 
2477 
2478 //--------------------------------------------------------------------------------
2479 //
2480 //     getMatchCallback
2481 //
2482 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2483 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2484                                                 const void                    *&context,
2485                                                 UErrorCode                    &status) {
2486     if (U_FAILURE(status)) {
2487        return;
2488     }
2489     callback = fFindProgressCallbackFn;
2490     context  = fFindProgressCallbackContext;
2491 }
2492 
2493 
2494 //================================================================================
2495 //
2496 //    Code following this point in this file is the internal
2497 //    Match Engine Implementation.
2498 //
2499 //================================================================================
2500 
2501 
2502 //--------------------------------------------------------------------------------
2503 //
2504 //   resetStack
2505 //           Discard any previous contents of the state save stack, and initialize a
2506 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2507 //           where they indicate that a group has not yet matched anything.
2508 //--------------------------------------------------------------------------------
resetStack()2509 REStackFrame *RegexMatcher::resetStack() {
2510     // Discard any previous contents of the state save stack, and initialize a
2511     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2512     //  where they indicate that a group has not yet matched anything.
2513     fStack->removeAllElements();
2514 
2515     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2516     if(U_FAILURE(fDeferredStatus)) {
2517         return nullptr;
2518     }
2519 
2520     int32_t i;
2521     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2522         iFrame->fExtra[i] = -1;
2523     }
2524     return iFrame;
2525 }
2526 
2527 
2528 
2529 //--------------------------------------------------------------------------------
2530 //
2531 //   isWordBoundary
2532 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2533 //                     For us,
2534 //                       If the current char is a combining mark,
2535 //                          \b is false.
2536 //                       Else Scan backwards to the first non-combining char.
2537 //                            We are at a boundary if the this char and the original chars are
2538 //                               opposite in membership in \w set
2539 //
2540 //          parameters:   pos   - the current position in the input buffer
2541 //
2542 //              TODO:  double-check edge cases at region boundaries.
2543 //
2544 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2545 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2546     UBool isBoundary = false;
2547     UBool cIsWord    = false;
2548 
2549     if (pos >= fLookLimit) {
2550         fHitEnd = true;
2551     } else {
2552         // Determine whether char c at current position is a member of the word set of chars.
2553         // If we're off the end of the string, behave as though we're not at a word char.
2554         UTEXT_SETNATIVEINDEX(fInputText, pos);
2555         UChar32  c = UTEXT_CURRENT32(fInputText);
2556         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2557             // Current char is a combining one.  Not a boundary.
2558             return false;
2559         }
2560         cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2561     }
2562 
2563     // Back up until we come to a non-combining char, determine whether
2564     //  that char is a word char.
2565     UBool prevCIsWord = false;
2566     for (;;) {
2567         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2568             break;
2569         }
2570         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2571         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2572               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2573             prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2574             break;
2575         }
2576     }
2577     isBoundary = cIsWord ^ prevCIsWord;
2578     return isBoundary;
2579 }
2580 
isChunkWordBoundary(int32_t pos)2581 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2582     UBool isBoundary = false;
2583     UBool cIsWord    = false;
2584 
2585     const char16_t *inputBuf = fInputText->chunkContents;
2586 
2587     if (pos >= fLookLimit) {
2588         fHitEnd = true;
2589     } else {
2590         // Determine whether char c at current position is a member of the word set of chars.
2591         // If we're off the end of the string, behave as though we're not at a word char.
2592         UChar32 c;
2593         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2594         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2595             // Current char is a combining one.  Not a boundary.
2596             return false;
2597         }
2598         cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2599     }
2600 
2601     // Back up until we come to a non-combining char, determine whether
2602     //  that char is a word char.
2603     UBool prevCIsWord = false;
2604     for (;;) {
2605         if (pos <= fLookStart) {
2606             break;
2607         }
2608         UChar32 prevChar;
2609         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2610         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2611               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2612             prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2613             break;
2614         }
2615     }
2616     isBoundary = cIsWord ^ prevCIsWord;
2617     return isBoundary;
2618 }
2619 
2620 //--------------------------------------------------------------------------------
2621 //
2622 //   isUWordBoundary
2623 //
2624 //         Test for a word boundary using RBBI word break.
2625 //
2626 //          parameters:   pos   - the current position in the input buffer
2627 //
2628 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos,UErrorCode & status)2629 UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
2630     UBool       returnVal = false;
2631 
2632 #if UCONFIG_NO_BREAK_ITERATION==0
2633     // Note: this point will never be reached if break iteration is configured out.
2634     //       Regex patterns that would require this function will fail to compile.
2635 
2636     // If we haven't yet created a break iterator for this matcher, do it now.
2637     if (fWordBreakItr == nullptr) {
2638         fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
2639         if (U_FAILURE(status)) {
2640             return false;
2641         }
2642         fWordBreakItr->setText(fInputText, status);
2643     }
2644 
2645     // Note: zero width boundary tests like \b see through transparent region bounds,
2646     //       which is why fLookLimit is used here, rather than fActiveLimit.
2647     if (pos >= fLookLimit) {
2648         fHitEnd = true;
2649         returnVal = true;   // With Unicode word rules, only positions within the interior of "real"
2650                             //    words are not boundaries.  All non-word chars stand by themselves,
2651                             //    with word boundaries on both sides.
2652     } else {
2653         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2654     }
2655 #endif
2656     return   returnVal;
2657 }
2658 
2659 
followingGCBoundary(int64_t pos,UErrorCode & status)2660 int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
2661     int64_t result = pos;
2662 
2663 #if UCONFIG_NO_BREAK_ITERATION==0
2664     // Note: this point will never be reached if break iteration is configured out.
2665     //       Regex patterns that would require this function will fail to compile.
2666 
2667     // If we haven't yet created a break iterator for this matcher, do it now.
2668     if (fGCBreakItr == nullptr) {
2669         fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2670         if (U_FAILURE(status)) {
2671             return pos;
2672         }
2673         fGCBreakItr->setText(fInputText, status);
2674     }
2675     result = fGCBreakItr->following(pos);
2676     if (result == BreakIterator::DONE) {
2677         result = pos;
2678     }
2679 #endif
2680     return result;
2681 }
2682 
2683 //--------------------------------------------------------------------------------
2684 //
2685 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2686 //                     saves. Increment the "time" counter, and call the
2687 //                     user callback function if there is one installed.
2688 //
2689 //                     If the match operation needs to be aborted, either for a time-out
2690 //                     or because the user callback asked for it, just set an error status.
2691 //                     The engine will pick that up and stop in its outer loop.
2692 //
2693 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2694 void RegexMatcher::IncrementTime(UErrorCode &status) {
2695     fTickCounter = TIMER_INITIAL_VALUE;
2696     fTime++;
2697     if (fCallbackFn != nullptr) {
2698         if ((*fCallbackFn)(fCallbackContext, fTime) == false) {
2699             status = U_REGEX_STOPPED_BY_CALLER;
2700             return;
2701         }
2702     }
2703     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2704         status = U_REGEX_TIME_OUT;
2705     }
2706 }
2707 
2708 //--------------------------------------------------------------------------------
2709 //
2710 //   StateSave
2711 //       Make a new stack frame, initialized as a copy of the current stack frame.
2712 //       Set the pattern index in the original stack frame from the operand value
2713 //       in the opcode.  Execution of the engine continues with the state in
2714 //       the newly created stack frame
2715 //
2716 //       Note that reserveBlock() may grow the stack, resulting in the
2717 //       whole thing being relocated in memory.
2718 //
2719 //    Parameters:
2720 //       fp           The top frame pointer when called.  At return, a new
2721 //                    fame will be present
2722 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2723 //                    (not new) frame.  If execution ever back-tracks out of the
2724 //                    new frame, this will be where we continue from in the pattern.
2725 //    Return
2726 //                    The new frame pointer.
2727 //
2728 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2729 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2730     if (U_FAILURE(status)) {
2731         return fp;
2732     }
2733     // push storage for a new frame.
2734     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2735     if (U_FAILURE(status)) {
2736         // Failure on attempted stack expansion.
2737         //   Stack function set some other error code, change it to a more
2738         //   specific one for regular expressions.
2739         status = U_REGEX_STACK_OVERFLOW;
2740         // We need to return a writable stack frame, so just return the
2741         //    previous frame.  The match operation will stop quickly
2742         //    because of the error status, after which the frame will never
2743         //    be looked at again.
2744         return fp;
2745     }
2746     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2747 
2748     // New stack frame = copy of old top frame.
2749     int64_t *source = (int64_t *)fp;
2750     int64_t *dest   = newFP;
2751     for (;;) {
2752         *dest++ = *source++;
2753         if (source == newFP) {
2754             break;
2755         }
2756     }
2757 
2758     fTickCounter--;
2759     if (fTickCounter <= 0) {
2760        IncrementTime(status);    // Re-initializes fTickCounter
2761     }
2762     fp->fPatIdx = savePatIdx;
2763     return (REStackFrame *)newFP;
2764 }
2765 
2766 #if defined(REGEX_DEBUG)
2767 namespace {
StringFromUText(UText * ut)2768 UnicodeString StringFromUText(UText *ut) {
2769     UnicodeString result;
2770     for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2771         result.append(c);
2772     }
2773     return result;
2774 }
2775 }
2776 #endif // REGEX_DEBUG
2777 
2778 
2779 //--------------------------------------------------------------------------------
2780 //
2781 //   MatchAt      This is the actual matching engine.
2782 //
2783 //                  startIdx:    begin matching a this index.
2784 //                  toEnd:       if true, match must extend to end of the input region
2785 //
2786 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2787 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2788     UBool       isMatch  = false;      // True if the we have a match.
2789 
2790     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2791 
2792     int32_t     op;                    // Operation from the compiled pattern, split into
2793     int32_t     opType;                //    the opcode
2794     int32_t     opValue;               //    and the operand value.
2795 
2796 #ifdef REGEX_RUN_DEBUG
2797     if (fTraceDebug) {
2798         printf("MatchAt(startIdx=%ld)\n", startIdx);
2799         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2800         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2801     }
2802 #endif
2803 
2804     if (U_FAILURE(status)) {
2805         return;
2806     }
2807 
2808     //  Cache frequently referenced items from the compiled pattern
2809     //
2810     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2811 
2812     const char16_t      *litText       = fPattern->fLiteralText.getBuffer();
2813     UVector             *fSets         = fPattern->fSets;
2814 
2815     fFrameSize = fPattern->fFrameSize;
2816     REStackFrame        *fp            = resetStack();
2817     if (U_FAILURE(fDeferredStatus)) {
2818         status = fDeferredStatus;
2819         return;
2820     }
2821 
2822     fp->fPatIdx   = 0;
2823     fp->fInputIdx = startIdx;
2824 
2825     // Zero out the pattern's static data
2826     int32_t i;
2827     for (i = 0; i<fPattern->fDataSize; i++) {
2828         fData[i] = 0;
2829     }
2830 
2831     //
2832     //  Main loop for interpreting the compiled pattern.
2833     //  One iteration of the loop per pattern operation performed.
2834     //
2835     for (;;) {
2836         op      = (int32_t)pat[fp->fPatIdx];
2837         opType  = URX_TYPE(op);
2838         opValue = URX_VAL(op);
2839 #ifdef REGEX_RUN_DEBUG
2840         if (fTraceDebug) {
2841             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2842             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2843                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2844             fPattern->dumpOp(fp->fPatIdx);
2845         }
2846 #endif
2847         fp->fPatIdx++;
2848 
2849         switch (opType) {
2850 
2851 
2852         case URX_NOP:
2853             break;
2854 
2855 
2856         case URX_BACKTRACK:
2857             // Force a backtrack.  In some circumstances, the pattern compiler
2858             //   will notice that the pattern can't possibly match anything, and will
2859             //   emit one of these at that point.
2860             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2861             break;
2862 
2863 
2864         case URX_ONECHAR:
2865             if (fp->fInputIdx < fActiveLimit) {
2866                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2867                 UChar32 c = UTEXT_NEXT32(fInputText);
2868                 if (c == opValue) {
2869                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2870                     break;
2871                 }
2872             } else {
2873                 fHitEnd = true;
2874             }
2875             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2876             break;
2877 
2878 
2879         case URX_STRING:
2880             {
2881                 // Test input against a literal string.
2882                 // Strings require two slots in the compiled pattern, one for the
2883                 //   offset to the string text, and one for the length.
2884 
2885                 int32_t   stringStartIdx = opValue;
2886                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2887                 fp->fPatIdx++;
2888                 opType    = URX_TYPE(op);
2889                 int32_t stringLen = URX_VAL(op);
2890                 U_ASSERT(opType == URX_STRING_LEN);
2891                 U_ASSERT(stringLen >= 2);
2892 
2893                 const char16_t *patternString = litText+stringStartIdx;
2894                 int32_t patternStringIndex = 0;
2895                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2896                 UChar32 inputChar;
2897                 UChar32 patternChar;
2898                 UBool success = true;
2899                 while (patternStringIndex < stringLen) {
2900                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2901                         success = false;
2902                         fHitEnd = true;
2903                         break;
2904                     }
2905                     inputChar = UTEXT_NEXT32(fInputText);
2906                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2907                     if (patternChar != inputChar) {
2908                         success = false;
2909                         break;
2910                     }
2911                 }
2912 
2913                 if (success) {
2914                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2915                 } else {
2916                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2917                 }
2918             }
2919             break;
2920 
2921 
2922         case URX_STATE_SAVE:
2923             fp = StateSave(fp, opValue, status);
2924             break;
2925 
2926 
2927         case URX_END:
2928             // The match loop will exit via this path on a successful match,
2929             //   when we reach the end of the pattern.
2930             if (toEnd && fp->fInputIdx != fActiveLimit) {
2931                 // The pattern matched, but not to the end of input.  Try some more.
2932                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2933                 break;
2934             }
2935             isMatch = true;
2936             goto  breakFromLoop;
2937 
2938         // Start and End Capture stack frame variables are laid out out like this:
2939             //  fp->fExtra[opValue]  - The start of a completed capture group
2940             //             opValue+1 - The end   of a completed capture group
2941             //             opValue+2 - the start of a capture group whose end
2942             //                          has not yet been reached (and might not ever be).
2943         case URX_START_CAPTURE:
2944             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2945             fp->fExtra[opValue+2] = fp->fInputIdx;
2946             break;
2947 
2948 
2949         case URX_END_CAPTURE:
2950             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2951             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2952             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2953             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2954             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2955             break;
2956 
2957 
2958         case URX_DOLLAR:                   //  $, test for End of line
2959                                            //     or for position before new line at end of input
2960             {
2961                 if (fp->fInputIdx >= fAnchorLimit) {
2962                     // We really are at the end of input.  Success.
2963                     fHitEnd = true;
2964                     fRequireEnd = true;
2965                     break;
2966                 }
2967 
2968                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2969 
2970                 // If we are positioned just before a new-line that is located at the
2971                 //   end of input, succeed.
2972                 UChar32 c = UTEXT_NEXT32(fInputText);
2973                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2974                     if (isLineTerminator(c)) {
2975                         // If not in the middle of a CR/LF sequence
2976                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2977                             // At new-line at end of input. Success
2978                             fHitEnd = true;
2979                             fRequireEnd = true;
2980 
2981                             break;
2982                         }
2983                     }
2984                 } else {
2985                     UChar32 nextC = UTEXT_NEXT32(fInputText);
2986                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2987                         fHitEnd = true;
2988                         fRequireEnd = true;
2989                         break;                         // At CR/LF at end of input.  Success
2990                     }
2991                 }
2992 
2993                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2994             }
2995             break;
2996 
2997 
2998          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
2999             if (fp->fInputIdx >= fAnchorLimit) {
3000                 // Off the end of input.  Success.
3001                 fHitEnd = true;
3002                 fRequireEnd = true;
3003                 break;
3004             } else {
3005                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3006                 UChar32 c = UTEXT_NEXT32(fInputText);
3007                 // Either at the last character of input, or off the end.
3008                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3009                     fHitEnd = true;
3010                     fRequireEnd = true;
3011                     break;
3012                 }
3013             }
3014 
3015             // Not at end of input.  Back-track out.
3016             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3017             break;
3018 
3019 
3020          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3021              {
3022                  if (fp->fInputIdx >= fAnchorLimit) {
3023                      // We really are at the end of input.  Success.
3024                      fHitEnd = true;
3025                      fRequireEnd = true;
3026                      break;
3027                  }
3028                  // If we are positioned just before a new-line, succeed.
3029                  // It makes no difference where the new-line is within the input.
3030                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3031                  UChar32 c = UTEXT_CURRENT32(fInputText);
3032                  if (isLineTerminator(c)) {
3033                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3034                      //  In multi-line mode, hitting a new-line just before the end of input does not
3035                      //   set the hitEnd or requireEnd flags
3036                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3037                         break;
3038                      }
3039                  }
3040                  // not at a new line.  Fail.
3041                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3042              }
3043              break;
3044 
3045 
3046          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3047              {
3048                  if (fp->fInputIdx >= fAnchorLimit) {
3049                      // We really are at the end of input.  Success.
3050                      fHitEnd = true;
3051                      fRequireEnd = true;  // Java set requireEnd in this case, even though
3052                      break;               //   adding a new-line would not lose the match.
3053                  }
3054                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3055                  // It makes no difference where the new-line is within the input.
3056                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3057                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3058                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3059                  }
3060              }
3061              break;
3062 
3063 
3064        case URX_CARET:                    //  ^, test for start of line
3065             if (fp->fInputIdx != fAnchorStart) {
3066                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3067             }
3068             break;
3069 
3070 
3071        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3072            {
3073                if (fp->fInputIdx == fAnchorStart) {
3074                    // We are at the start input.  Success.
3075                    break;
3076                }
3077                // Check whether character just before the current pos is a new-line
3078                //   unless we are at the end of input
3079                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3080                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3081                if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3082                    //  It's a new-line.  ^ is true.  Success.
3083                    //  TODO:  what should be done with positions between a CR and LF?
3084                    break;
3085                }
3086                // Not at the start of a line.  Fail.
3087                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3088            }
3089            break;
3090 
3091 
3092        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3093            {
3094                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3095                if (fp->fInputIdx <= fAnchorStart) {
3096                    // We are at the start input.  Success.
3097                    break;
3098                }
3099                // Check whether character just before the current pos is a new-line
3100                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3101                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3102                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3103                if (c != 0x0a) {
3104                    // Not at the start of a line.  Back-track out.
3105                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3106                }
3107            }
3108            break;
3109 
3110         case URX_BACKSLASH_B:          // Test for word boundaries
3111             {
3112                 UBool success = isWordBoundary(fp->fInputIdx);
3113                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3114                 if (!success) {
3115                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3116                 }
3117             }
3118             break;
3119 
3120 
3121         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3122             {
3123                 UBool success = isUWordBoundary(fp->fInputIdx, status);
3124                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3125                 if (!success) {
3126                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3127                 }
3128             }
3129             break;
3130 
3131 
3132         case URX_BACKSLASH_D:            // Test for decimal digit
3133             {
3134                 if (fp->fInputIdx >= fActiveLimit) {
3135                     fHitEnd = true;
3136                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3137                     break;
3138                 }
3139 
3140                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3141 
3142                 UChar32 c = UTEXT_NEXT32(fInputText);
3143                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3144                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3145                 success ^= (UBool)(opValue != 0);        // flip sense for \D
3146                 if (success) {
3147                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3148                 } else {
3149                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3150                 }
3151             }
3152             break;
3153 
3154 
3155         case URX_BACKSLASH_G:          // Test for position at end of previous match
3156             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
3157                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3158             }
3159             break;
3160 
3161 
3162         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
3163             {
3164                 if (fp->fInputIdx >= fActiveLimit) {
3165                     fHitEnd = true;
3166                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3167                     break;
3168                 }
3169                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3170                 UChar32 c = UTEXT_NEXT32(fInputText);
3171                 int8_t ctype = u_charType(c);
3172                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
3173                 success ^= (UBool)(opValue != 0);        // flip sense for \H
3174                 if (success) {
3175                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3176                 } else {
3177                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3178                 }
3179             }
3180             break;
3181 
3182 
3183         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
3184             {
3185                 if (fp->fInputIdx >= fActiveLimit) {
3186                     fHitEnd = true;
3187                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3188                     break;
3189                 }
3190                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3191                 UChar32 c = UTEXT_NEXT32(fInputText);
3192                 if (isLineTerminator(c)) {
3193                     if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3194                         utext_next32(fInputText);
3195                     }
3196                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3197                 } else {
3198                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3199                 }
3200             }
3201             break;
3202 
3203 
3204         case URX_BACKSLASH_V:            // \v, any single line ending character.
3205             {
3206                 if (fp->fInputIdx >= fActiveLimit) {
3207                     fHitEnd = true;
3208                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3209                     break;
3210                 }
3211                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3212                 UChar32 c = UTEXT_NEXT32(fInputText);
3213                 UBool success = isLineTerminator(c);
3214                 success ^= (UBool)(opValue != 0);        // flip sense for \V
3215                 if (success) {
3216                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3217                 } else {
3218                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3219                 }
3220             }
3221             break;
3222 
3223 
3224         case URX_BACKSLASH_X:
3225             //  Match a Grapheme, as defined by Unicode UAX 29.
3226 
3227             // Fail if at end of input
3228             if (fp->fInputIdx >= fActiveLimit) {
3229                 fHitEnd = true;
3230                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3231                 break;
3232             }
3233 
3234             fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
3235             if (fp->fInputIdx >= fActiveLimit) {
3236                 fHitEnd = true;
3237                 fp->fInputIdx = fActiveLimit;
3238             }
3239             break;
3240 
3241 
3242         case URX_BACKSLASH_Z:          // Test for end of Input
3243             if (fp->fInputIdx < fAnchorLimit) {
3244                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3245             } else {
3246                 fHitEnd = true;
3247                 fRequireEnd = true;
3248             }
3249             break;
3250 
3251 
3252 
3253         case URX_STATIC_SETREF:
3254             {
3255                 // Test input character against one of the predefined sets
3256                 //    (Word Characters, for example)
3257                 // The high bit of the op value is a flag for the match polarity.
3258                 //    0:   success if input char is in set.
3259                 //    1:   success if input char is not in set.
3260                 if (fp->fInputIdx >= fActiveLimit) {
3261                     fHitEnd = true;
3262                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3263                     break;
3264                 }
3265 
3266                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3267                 opValue &= ~URX_NEG_SET;
3268                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3269 
3270                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3271                 UChar32 c = UTEXT_NEXT32(fInputText);
3272                 if (c < 256) {
3273                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3274                     if (s8.contains(c)) {
3275                         success = !success;
3276                     }
3277                 } else {
3278                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3279                     if (s.contains(c)) {
3280                         success = !success;
3281                     }
3282                 }
3283                 if (success) {
3284                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3285                 } else {
3286                     // the character wasn't in the set.
3287                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3288                 }
3289             }
3290             break;
3291 
3292 
3293         case URX_STAT_SETREF_N:
3294             {
3295                 // Test input character for NOT being a member of  one of
3296                 //    the predefined sets (Word Characters, for example)
3297                 if (fp->fInputIdx >= fActiveLimit) {
3298                     fHitEnd = true;
3299                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3300                     break;
3301                 }
3302 
3303                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3304 
3305                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3306 
3307                 UChar32 c = UTEXT_NEXT32(fInputText);
3308                 if (c < 256) {
3309                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3310                     if (s8.contains(c) == false) {
3311                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3312                         break;
3313                     }
3314                 } else {
3315                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3316                     if (s.contains(c) == false) {
3317                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3318                         break;
3319                     }
3320                 }
3321                 // the character wasn't in the set.
3322                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3323             }
3324             break;
3325 
3326 
3327         case URX_SETREF:
3328             if (fp->fInputIdx >= fActiveLimit) {
3329                 fHitEnd = true;
3330                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3331                 break;
3332             } else {
3333                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3334 
3335                 // There is input left.  Pick up one char and test it for set membership.
3336                 UChar32 c = UTEXT_NEXT32(fInputText);
3337                 U_ASSERT(opValue > 0 && opValue < fSets->size());
3338                 if (c<256) {
3339                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3340                     if (s8->contains(c)) {
3341                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3342                         break;
3343                     }
3344                 } else {
3345                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3346                     if (s->contains(c)) {
3347                         // The character is in the set.  A Match.
3348                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3349                         break;
3350                     }
3351                 }
3352 
3353                 // the character wasn't in the set.
3354                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3355             }
3356             break;
3357 
3358 
3359         case URX_DOTANY:
3360             {
3361                 // . matches anything, but stops at end-of-line.
3362                 if (fp->fInputIdx >= fActiveLimit) {
3363                     // At end of input.  Match failed.  Backtrack out.
3364                     fHitEnd = true;
3365                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3366                     break;
3367                 }
3368 
3369                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3370 
3371                 // There is input left.  Advance over one char, unless we've hit end-of-line
3372                 UChar32 c = UTEXT_NEXT32(fInputText);
3373                 if (isLineTerminator(c)) {
3374                     // End of line in normal mode.   . does not match.
3375                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3376                     break;
3377                 }
3378                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3379             }
3380             break;
3381 
3382 
3383         case URX_DOTANY_ALL:
3384             {
3385                 // ., in dot-matches-all (including new lines) mode
3386                 if (fp->fInputIdx >= fActiveLimit) {
3387                     // At end of input.  Match failed.  Backtrack out.
3388                     fHitEnd = true;
3389                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3390                     break;
3391                 }
3392 
3393                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3394 
3395                 // There is input left.  Advance over one char, except if we are
3396                 //   at a cr/lf, advance over both of them.
3397                 UChar32 c;
3398                 c = UTEXT_NEXT32(fInputText);
3399                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3400                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3401                     // In the case of a CR/LF, we need to advance over both.
3402                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3403                     if (nextc == 0x0a) {
3404                         (void)UTEXT_NEXT32(fInputText);
3405                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3406                     }
3407                 }
3408             }
3409             break;
3410 
3411 
3412         case URX_DOTANY_UNIX:
3413             {
3414                 // '.' operator, matches all, but stops at end-of-line.
3415                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3416                 if (fp->fInputIdx >= fActiveLimit) {
3417                     // At end of input.  Match failed.  Backtrack out.
3418                     fHitEnd = true;
3419                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3420                     break;
3421                 }
3422 
3423                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3424 
3425                 // There is input left.  Advance over one char, unless we've hit end-of-line
3426                 UChar32 c = UTEXT_NEXT32(fInputText);
3427                 if (c == 0x0a) {
3428                     // End of line in normal mode.   '.' does not match the \n
3429                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3430                 } else {
3431                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3432                 }
3433             }
3434             break;
3435 
3436 
3437         case URX_JMP:
3438             fp->fPatIdx = opValue;
3439             break;
3440 
3441         case URX_FAIL:
3442             isMatch = false;
3443             goto breakFromLoop;
3444 
3445         case URX_JMP_SAV:
3446             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3447             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3448             fp->fPatIdx = opValue;                         // Then JMP.
3449             break;
3450 
3451         case URX_JMP_SAV_X:
3452             // This opcode is used with (x)+, when x can match a zero length string.
3453             // Same as JMP_SAV, except conditional on the match having made forward progress.
3454             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3455             //   data address of the input position at the start of the loop.
3456             {
3457                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3458                 int32_t  stoOp = (int32_t)pat[opValue-1];
3459                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3460                 int32_t  frameLoc = URX_VAL(stoOp);
3461                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3462                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3463                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3464                 if (prevInputIdx < fp->fInputIdx) {
3465                     // The match did make progress.  Repeat the loop.
3466                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3467                     fp->fPatIdx = opValue;
3468                     fp->fExtra[frameLoc] = fp->fInputIdx;
3469                 }
3470                 // If the input position did not advance, we do nothing here,
3471                 //   execution will fall out of the loop.
3472             }
3473             break;
3474 
3475         case URX_CTR_INIT:
3476             {
3477                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3478                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3479 
3480                 // Pick up the three extra operands that CTR_INIT has, and
3481                 //    skip the pattern location counter past
3482                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3483                 fp->fPatIdx += 3;
3484                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3485                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3486                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3487                 U_ASSERT(minCount>=0);
3488                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3489                 U_ASSERT(loopLoc>=fp->fPatIdx);
3490 
3491                 if (minCount == 0) {
3492                     fp = StateSave(fp, loopLoc+1, status);
3493                 }
3494                 if (maxCount == -1) {
3495                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3496                 } else if (maxCount == 0) {
3497                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3498                 }
3499             }
3500             break;
3501 
3502         case URX_CTR_LOOP:
3503             {
3504                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3505                 int32_t initOp = (int32_t)pat[opValue];
3506                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3507                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3508                 int32_t minCount  = (int32_t)pat[opValue+2];
3509                 int32_t maxCount  = (int32_t)pat[opValue+3];
3510                 (*pCounter)++;
3511                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3512                     U_ASSERT(*pCounter == maxCount);
3513                     break;
3514                 }
3515                 if (*pCounter >= minCount) {
3516                     if (maxCount == -1) {
3517                         // Loop has no hard upper bound.
3518                         // Check that it is progressing through the input, break if it is not.
3519                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3520                         if (fp->fInputIdx == *pLastInputIdx) {
3521                             break;
3522                         } else {
3523                             *pLastInputIdx = fp->fInputIdx;
3524                         }
3525                     }
3526                     fp = StateSave(fp, fp->fPatIdx, status);
3527                 } else {
3528                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3529                     fTickCounter--;
3530                     if (fTickCounter <= 0) {
3531                         IncrementTime(status);    // Re-initializes fTickCounter
3532                     }
3533                 }
3534 
3535                 fp->fPatIdx = opValue + 4;    // Loop back.
3536             }
3537             break;
3538 
3539         case URX_CTR_INIT_NG:
3540             {
3541                 // Initialize a non-greedy loop
3542                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3543                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3544 
3545                 // Pick up the three extra operands that CTR_INIT_NG has, and
3546                 //    skip the pattern location counter past
3547                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3548                 fp->fPatIdx += 3;
3549                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3550                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3551                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3552                 U_ASSERT(minCount>=0);
3553                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3554                 U_ASSERT(loopLoc>fp->fPatIdx);
3555                 if (maxCount == -1) {
3556                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3557                 }
3558 
3559                 if (minCount == 0) {
3560                     if (maxCount != 0) {
3561                         fp = StateSave(fp, fp->fPatIdx, status);
3562                     }
3563                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3564                 }
3565             }
3566             break;
3567 
3568         case URX_CTR_LOOP_NG:
3569             {
3570                 // Non-greedy {min, max} loops
3571                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3572                 int32_t initOp = (int32_t)pat[opValue];
3573                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3574                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3575                 int32_t minCount  = (int32_t)pat[opValue+2];
3576                 int32_t maxCount  = (int32_t)pat[opValue+3];
3577 
3578                 (*pCounter)++;
3579                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3580                     // The loop has matched the maximum permitted number of times.
3581                     //   Break out of here with no action.  Matching will
3582                     //   continue with the following pattern.
3583                     U_ASSERT(*pCounter == maxCount);
3584                     break;
3585                 }
3586 
3587                 if (*pCounter < minCount) {
3588                     // We haven't met the minimum number of matches yet.
3589                     //   Loop back for another one.
3590                     fp->fPatIdx = opValue + 4;    // Loop back.
3591                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3592                     fTickCounter--;
3593                     if (fTickCounter <= 0) {
3594                         IncrementTime(status);    // Re-initializes fTickCounter
3595                     }
3596                 } else {
3597                     // We do have the minimum number of matches.
3598 
3599                     // If there is no upper bound on the loop iterations, check that the input index
3600                     // is progressing, and stop the loop if it is not.
3601                     if (maxCount == -1) {
3602                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3603                         if (fp->fInputIdx == *pLastInputIdx) {
3604                             break;
3605                         }
3606                         *pLastInputIdx = fp->fInputIdx;
3607                     }
3608 
3609                     // Loop Continuation: we will fall into the pattern following the loop
3610                     //   (non-greedy, don't execute loop body first), but first do
3611                     //   a state save to the top of the loop, so that a match failure
3612                     //   in the following pattern will try another iteration of the loop.
3613                     fp = StateSave(fp, opValue + 4, status);
3614                 }
3615             }
3616             break;
3617 
3618         case URX_STO_SP:
3619             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3620             fData[opValue] = fStack->size();
3621             break;
3622 
3623         case URX_LD_SP:
3624             {
3625                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3626                 int32_t newStackSize = (int32_t)fData[opValue];
3627                 U_ASSERT(newStackSize <= fStack->size());
3628                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3629                 if (newFP == (int64_t *)fp) {
3630                     break;
3631                 }
3632                 int32_t j;
3633                 for (j=0; j<fFrameSize; j++) {
3634                     newFP[j] = ((int64_t *)fp)[j];
3635                 }
3636                 fp = (REStackFrame *)newFP;
3637                 fStack->setSize(newStackSize);
3638             }
3639             break;
3640 
3641         case URX_BACKREF:
3642             {
3643                 U_ASSERT(opValue < fFrameSize);
3644                 int64_t groupStartIdx = fp->fExtra[opValue];
3645                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3646                 U_ASSERT(groupStartIdx <= groupEndIdx);
3647                 if (groupStartIdx < 0) {
3648                     // This capture group has not participated in the match thus far,
3649                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3650                     break;
3651                 }
3652                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3653                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3654 
3655                 //   Note: if the capture group match was of an empty string the backref
3656                 //         match succeeds.  Verified by testing:  Perl matches succeed
3657                 //         in this case, so we do too.
3658 
3659                 UBool success = true;
3660                 for (;;) {
3661                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3662                         success = true;
3663                         break;
3664                     }
3665                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3666                         success = false;
3667                         fHitEnd = true;
3668                         break;
3669                     }
3670                     UChar32 captureGroupChar = utext_next32(fAltInputText);
3671                     UChar32 inputChar = utext_next32(fInputText);
3672                     if (inputChar != captureGroupChar) {
3673                         success = false;
3674                         break;
3675                     }
3676                 }
3677 
3678                 if (success) {
3679                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3680                 } else {
3681                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3682                 }
3683             }
3684             break;
3685 
3686 
3687 
3688         case URX_BACKREF_I:
3689             {
3690                 U_ASSERT(opValue < fFrameSize);
3691                 int64_t groupStartIdx = fp->fExtra[opValue];
3692                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3693                 U_ASSERT(groupStartIdx <= groupEndIdx);
3694                 if (groupStartIdx < 0) {
3695                     // This capture group has not participated in the match thus far,
3696                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3697                     break;
3698                 }
3699                 utext_setNativeIndex(fAltInputText, groupStartIdx);
3700                 utext_setNativeIndex(fInputText, fp->fInputIdx);
3701                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3702                 CaseFoldingUTextIterator inputItr(*fInputText);
3703 
3704                 //   Note: if the capture group match was of an empty string the backref
3705                 //         match succeeds.  Verified by testing:  Perl matches succeed
3706                 //         in this case, so we do too.
3707 
3708                 UBool success = true;
3709                 for (;;) {
3710                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3711                         success = true;
3712                         break;
3713                     }
3714                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3715                         success = false;
3716                         fHitEnd = true;
3717                         break;
3718                     }
3719                     UChar32 captureGroupChar = captureGroupItr.next();
3720                     UChar32 inputChar = inputItr.next();
3721                     if (inputChar != captureGroupChar) {
3722                         success = false;
3723                         break;
3724                     }
3725                 }
3726 
3727                 if (success && inputItr.inExpansion()) {
3728                     // We obtained a match by consuming part of a string obtained from
3729                     // case-folding a single code point of the input text.
3730                     // This does not count as an overall match.
3731                     success = false;
3732                 }
3733 
3734                 if (success) {
3735                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3736                 } else {
3737                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3738                 }
3739 
3740             }
3741             break;
3742 
3743         case URX_STO_INP_LOC:
3744             {
3745                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3746                 fp->fExtra[opValue] = fp->fInputIdx;
3747             }
3748             break;
3749 
3750         case URX_JMPX:
3751             {
3752                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3753                 fp->fPatIdx += 1;
3754                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3755                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3756                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3757                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3758                 if (savedInputIdx < fp->fInputIdx) {
3759                     fp->fPatIdx = opValue;                               // JMP
3760                 } else {
3761                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3762                 }
3763             }
3764             break;
3765 
3766         case URX_LA_START:
3767             {
3768                 // Entering a look around block.
3769                 // Save Stack Ptr, Input Pos.
3770                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3771                 fData[opValue]   = fStack->size();
3772                 fData[opValue+1] = fp->fInputIdx;
3773                 fData[opValue+2] = fActiveStart;
3774                 fData[opValue+3] = fActiveLimit;
3775                 fActiveStart     = fLookStart;          // Set the match region change for
3776                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3777             }
3778             break;
3779 
3780         case URX_LA_END:
3781             {
3782                 // Leaving a look-ahead block.
3783                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3784                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3785                 int32_t stackSize = fStack->size();
3786                 int32_t newStackSize =(int32_t)fData[opValue];
3787                 U_ASSERT(stackSize >= newStackSize);
3788                 if (stackSize > newStackSize) {
3789                     // Copy the current top frame back to the new (cut back) top frame.
3790                     //   This makes the capture groups from within the look-ahead
3791                     //   expression available.
3792                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3793                     int32_t j;
3794                     for (j=0; j<fFrameSize; j++) {
3795                         newFP[j] = ((int64_t *)fp)[j];
3796                     }
3797                     fp = (REStackFrame *)newFP;
3798                     fStack->setSize(newStackSize);
3799                 }
3800                 fp->fInputIdx = fData[opValue+1];
3801 
3802                 // Restore the active region bounds in the input string; they may have
3803                 //    been changed because of transparent bounds on a Region.
3804                 fActiveStart = fData[opValue+2];
3805                 fActiveLimit = fData[opValue+3];
3806                 U_ASSERT(fActiveStart >= 0);
3807                 U_ASSERT(fActiveLimit <= fInputLength);
3808             }
3809             break;
3810 
3811         case URX_ONECHAR_I:
3812             // Case insensitive one char.  The char from the pattern is already case folded.
3813             // Input text is not, but case folding the input can not reduce two or more code
3814             // points to one.
3815             if (fp->fInputIdx < fActiveLimit) {
3816                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3817 
3818                 UChar32 c = UTEXT_NEXT32(fInputText);
3819                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3820                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3821                     break;
3822                 }
3823             } else {
3824                 fHitEnd = true;
3825             }
3826 
3827             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3828             break;
3829 
3830         case URX_STRING_I:
3831             {
3832                 // Case-insensitive test input against a literal string.
3833                 // Strings require two slots in the compiled pattern, one for the
3834                 //   offset to the string text, and one for the length.
3835                 //   The compiled string has already been case folded.
3836                 {
3837                     const char16_t *patternString = litText + opValue;
3838                     int32_t      patternStringIdx  = 0;
3839 
3840                     op      = (int32_t)pat[fp->fPatIdx];
3841                     fp->fPatIdx++;
3842                     opType  = URX_TYPE(op);
3843                     opValue = URX_VAL(op);
3844                     U_ASSERT(opType == URX_STRING_LEN);
3845                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3846 
3847 
3848                     UChar32   cPattern;
3849                     UChar32   cText;
3850                     UBool     success = true;
3851 
3852                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3853                     CaseFoldingUTextIterator inputIterator(*fInputText);
3854                     while (patternStringIdx < patternStringLen) {
3855                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3856                             success = false;
3857                             fHitEnd = true;
3858                             break;
3859                         }
3860                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3861                         cText = inputIterator.next();
3862                         if (cText != cPattern) {
3863                             success = false;
3864                             break;
3865                         }
3866                     }
3867                     if (inputIterator.inExpansion()) {
3868                         success = false;
3869                     }
3870 
3871                     if (success) {
3872                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3873                     } else {
3874                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3875                     }
3876                 }
3877             }
3878             break;
3879 
3880         case URX_LB_START:
3881             {
3882                 // Entering a look-behind block.
3883                 // Save Stack Ptr, Input Pos and active input region.
3884                 //   TODO:  implement transparent bounds.  Ticket #6067
3885                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3886                 fData[opValue]   = fStack->size();
3887                 fData[opValue+1] = fp->fInputIdx;
3888                 // Save input string length, then reset to pin any matches to end at
3889                 //   the current position.
3890                 fData[opValue+2] = fActiveStart;
3891                 fData[opValue+3] = fActiveLimit;
3892                 fActiveStart     = fRegionStart;
3893                 fActiveLimit     = fp->fInputIdx;
3894                 // Init the variable containing the start index for attempted matches.
3895                 fData[opValue+4] = -1;
3896             }
3897             break;
3898 
3899 
3900         case URX_LB_CONT:
3901             {
3902                 // Positive Look-Behind, at top of loop checking for matches of LB expression
3903                 //    at all possible input starting positions.
3904 
3905                 // Fetch the min and max possible match lengths.  They are the operands
3906                 //   of this op in the pattern.
3907                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3908                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3909                 if (!UTEXT_USES_U16(fInputText)) {
3910                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3911                     // The max length need not be exact; it just needs to be >= actual maximum.
3912                     maxML *= 3;
3913                 }
3914                 U_ASSERT(minML <= maxML);
3915                 U_ASSERT(minML >= 0);
3916 
3917                 // Fetch (from data) the last input index where a match was attempted.
3918                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3919                 int64_t  &lbStartIdx = fData[opValue+4];
3920                 if (lbStartIdx < 0) {
3921                     // First time through loop.
3922                     lbStartIdx = fp->fInputIdx - minML;
3923                     if (lbStartIdx > 0) {
3924                         // move index to a code point boundary, if it's not on one already.
3925                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3926                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3927                     }
3928                 } else {
3929                     // 2nd through nth time through the loop.
3930                     // Back up start position for match by one.
3931                     if (lbStartIdx == 0) {
3932                         (lbStartIdx)--;
3933                     } else {
3934                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3935                         (void)UTEXT_PREVIOUS32(fInputText);
3936                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3937                     }
3938                 }
3939 
3940                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3941                     // We have tried all potential match starting points without
3942                     //  getting a match.  Backtrack out, and out of the
3943                     //   Look Behind altogether.
3944                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3945                     fActiveStart = fData[opValue+2];
3946                     fActiveLimit = fData[opValue+3];
3947                     U_ASSERT(fActiveStart >= 0);
3948                     U_ASSERT(fActiveLimit <= fInputLength);
3949                     break;
3950                 }
3951 
3952                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3953                 //      (successful match will fall off the end of the loop.)
3954                 fp = StateSave(fp, fp->fPatIdx-3, status);
3955                 fp->fInputIdx = lbStartIdx;
3956             }
3957             break;
3958 
3959         case URX_LB_END:
3960             // End of a look-behind block, after a successful match.
3961             {
3962                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3963                 if (fp->fInputIdx != fActiveLimit) {
3964                     //  The look-behind expression matched, but the match did not
3965                     //    extend all the way to the point that we are looking behind from.
3966                     //  FAIL out of here, which will take us back to the LB_CONT, which
3967                     //     will retry the match starting at another position or fail
3968                     //     the look-behind altogether, whichever is appropriate.
3969                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3970                     break;
3971                 }
3972 
3973                 // Look-behind match is good.  Restore the original input string region,
3974                 //   which had been truncated to pin the end of the lookbehind match to the
3975                 //   position being looked-behind.
3976                 fActiveStart = fData[opValue+2];
3977                 fActiveLimit = fData[opValue+3];
3978                 U_ASSERT(fActiveStart >= 0);
3979                 U_ASSERT(fActiveLimit <= fInputLength);
3980             }
3981             break;
3982 
3983 
3984         case URX_LBN_CONT:
3985             {
3986                 // Negative Look-Behind, at top of loop checking for matches of LB expression
3987                 //    at all possible input starting positions.
3988 
3989                 // Fetch the extra parameters of this op.
3990                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
3991                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
3992                 if (!UTEXT_USES_U16(fInputText)) {
3993                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3994                     // The max length need not be exact; it just needs to be >= actual maximum.
3995                     maxML *= 3;
3996                 }
3997                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3998                         continueLoc = URX_VAL(continueLoc);
3999                 U_ASSERT(minML <= maxML);
4000                 U_ASSERT(minML >= 0);
4001                 U_ASSERT(continueLoc > fp->fPatIdx);
4002 
4003                 // Fetch (from data) the last input index where a match was attempted.
4004                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4005                 int64_t  &lbStartIdx = fData[opValue+4];
4006                 if (lbStartIdx < 0) {
4007                     // First time through loop.
4008                     lbStartIdx = fp->fInputIdx - minML;
4009                     if (lbStartIdx > 0) {
4010                         // move index to a code point boundary, if it's not on one already.
4011                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4012                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4013                     }
4014                 } else {
4015                     // 2nd through nth time through the loop.
4016                     // Back up start position for match by one.
4017                     if (lbStartIdx == 0) {
4018                         (lbStartIdx)--;
4019                     } else {
4020                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4021                         (void)UTEXT_PREVIOUS32(fInputText);
4022                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4023                     }
4024                 }
4025 
4026                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4027                     // We have tried all potential match starting points without
4028                     //  getting a match, which means that the negative lookbehind as
4029                     //  a whole has succeeded.  Jump forward to the continue location
4030                     fActiveStart = fData[opValue+2];
4031                     fActiveLimit = fData[opValue+3];
4032                     U_ASSERT(fActiveStart >= 0);
4033                     U_ASSERT(fActiveLimit <= fInputLength);
4034                     fp->fPatIdx = continueLoc;
4035                     break;
4036                 }
4037 
4038                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4039                 //      (successful match will cause a FAIL out of the loop altogether.)
4040                 fp = StateSave(fp, fp->fPatIdx-4, status);
4041                 fp->fInputIdx = lbStartIdx;
4042             }
4043             break;
4044 
4045         case URX_LBN_END:
4046             // End of a negative look-behind block, after a successful match.
4047             {
4048                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4049                 if (fp->fInputIdx != fActiveLimit) {
4050                     //  The look-behind expression matched, but the match did not
4051                     //    extend all the way to the point that we are looking behind from.
4052                     //  FAIL out of here, which will take us back to the LB_CONT, which
4053                     //     will retry the match starting at another position or succeed
4054                     //     the look-behind altogether, whichever is appropriate.
4055                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4056                     break;
4057                 }
4058 
4059                 // Look-behind expression matched, which means look-behind test as
4060                 //   a whole Fails
4061 
4062                 //   Restore the original input string length, which had been truncated
4063                 //   inorder to pin the end of the lookbehind match
4064                 //   to the position being looked-behind.
4065                 fActiveStart = fData[opValue+2];
4066                 fActiveLimit = fData[opValue+3];
4067                 U_ASSERT(fActiveStart >= 0);
4068                 U_ASSERT(fActiveLimit <= fInputLength);
4069 
4070                 // Restore original stack position, discarding any state saved
4071                 //   by the successful pattern match.
4072                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4073                 int32_t newStackSize = (int32_t)fData[opValue];
4074                 U_ASSERT(fStack->size() > newStackSize);
4075                 fStack->setSize(newStackSize);
4076 
4077                 //  FAIL, which will take control back to someplace
4078                 //  prior to entering the look-behind test.
4079                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4080             }
4081             break;
4082 
4083 
4084         case URX_LOOP_SR_I:
4085             // Loop Initialization for the optimized implementation of
4086             //     [some character set]*
4087             //   This op scans through all matching input.
4088             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4089             {
4090                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4091                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4092                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
4093 
4094                 // Loop through input, until either the input is exhausted or
4095                 //   we reach a character that is not a member of the set.
4096                 int64_t ix = fp->fInputIdx;
4097                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4098                 for (;;) {
4099                     if (ix >= fActiveLimit) {
4100                         fHitEnd = true;
4101                         break;
4102                     }
4103                     UChar32 c = UTEXT_NEXT32(fInputText);
4104                     if (c<256) {
4105                         if (s8->contains(c) == false) {
4106                             break;
4107                         }
4108                     } else {
4109                         if (s->contains(c) == false) {
4110                             break;
4111                         }
4112                     }
4113                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4114                 }
4115 
4116                 // If there were no matching characters, skip over the loop altogether.
4117                 //   The loop doesn't run at all, a * op always succeeds.
4118                 if (ix == fp->fInputIdx) {
4119                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4120                     break;
4121                 }
4122 
4123                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4124                 //   must follow.  It's operand is the stack location
4125                 //   that holds the starting input index for the match of this [set]*
4126                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4127                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4128                 int32_t stackLoc = URX_VAL(loopcOp);
4129                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4130                 fp->fExtra[stackLoc] = fp->fInputIdx;
4131                 fp->fInputIdx = ix;
4132 
4133                 // Save State to the URX_LOOP_C op that follows this one,
4134                 //   so that match failures in the following code will return to there.
4135                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4136                 fp = StateSave(fp, fp->fPatIdx, status);
4137                 fp->fPatIdx++;
4138             }
4139             break;
4140 
4141 
4142         case URX_LOOP_DOT_I:
4143             // Loop Initialization for the optimized implementation of .*
4144             //   This op scans through all remaining input.
4145             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4146             {
4147                 // Loop through input until the input is exhausted (we reach an end-of-line)
4148                 // In DOTALL mode, we can just go straight to the end of the input.
4149                 int64_t ix;
4150                 if ((opValue & 1) == 1) {
4151                     // Dot-matches-All mode.  Jump straight to the end of the string.
4152                     ix = fActiveLimit;
4153                     fHitEnd = true;
4154                 } else {
4155                     // NOT DOT ALL mode.  Line endings do not match '.'
4156                     // Scan forward until a line ending or end of input.
4157                     ix = fp->fInputIdx;
4158                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4159                     for (;;) {
4160                         if (ix >= fActiveLimit) {
4161                             fHitEnd = true;
4162                             break;
4163                         }
4164                         UChar32 c = UTEXT_NEXT32(fInputText);
4165                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4166                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4167                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4168                                     isLineTerminator(c))) {
4169                                 //  char is a line ending.  Exit the scanning loop.
4170                                 break;
4171                             }
4172                         }
4173                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4174                     }
4175                 }
4176 
4177                 // If there were no matching characters, skip over the loop altogether.
4178                 //   The loop doesn't run at all, a * op always succeeds.
4179                 if (ix == fp->fInputIdx) {
4180                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4181                     break;
4182                 }
4183 
4184                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4185                 //   must follow.  It's operand is the stack location
4186                 //   that holds the starting input index for the match of this .*
4187                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4188                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4189                 int32_t stackLoc = URX_VAL(loopcOp);
4190                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4191                 fp->fExtra[stackLoc] = fp->fInputIdx;
4192                 fp->fInputIdx = ix;
4193 
4194                 // Save State to the URX_LOOP_C op that follows this one,
4195                 //   so that match failures in the following code will return to there.
4196                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4197                 fp = StateSave(fp, fp->fPatIdx, status);
4198                 fp->fPatIdx++;
4199             }
4200             break;
4201 
4202 
4203         case URX_LOOP_C:
4204             {
4205                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4206                 backSearchIndex = fp->fExtra[opValue];
4207                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4208                 if (backSearchIndex == fp->fInputIdx) {
4209                     // We've backed up the input idx to the point that the loop started.
4210                     // The loop is done.  Leave here without saving state.
4211                     //  Subsequent failures won't come back here.
4212                     break;
4213                 }
4214                 // Set up for the next iteration of the loop, with input index
4215                 //   backed up by one from the last time through,
4216                 //   and a state save to this instruction in case the following code fails again.
4217                 //   (We're going backwards because this loop emulates stack unwinding, not
4218                 //    the initial scan forward.)
4219                 U_ASSERT(fp->fInputIdx > 0);
4220                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4221                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4222                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4223 
4224                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4225                 if (prevC == 0x0a &&
4226                     fp->fInputIdx > backSearchIndex &&
4227                     twoPrevC == 0x0d) {
4228                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4229                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4230                         // .*, stepping back over CRLF pair.
4231                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4232                     }
4233                 }
4234 
4235 
4236                 fp = StateSave(fp, fp->fPatIdx-1, status);
4237             }
4238             break;
4239 
4240 
4241 
4242         default:
4243             // Trouble.  The compiled pattern contains an entry with an
4244             //           unrecognized type tag.
4245             UPRV_UNREACHABLE_ASSERT;
4246             // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
4247             // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
4248             // See ICU-21669.
4249             status = U_INTERNAL_PROGRAM_ERROR;
4250         }
4251 
4252         if (U_FAILURE(status)) {
4253             isMatch = false;
4254             break;
4255         }
4256     }
4257 
4258 breakFromLoop:
4259     fMatch = isMatch;
4260     if (isMatch) {
4261         fLastMatchEnd = fMatchEnd;
4262         fMatchStart   = startIdx;
4263         fMatchEnd     = fp->fInputIdx;
4264     }
4265 
4266 #ifdef REGEX_RUN_DEBUG
4267     if (fTraceDebug) {
4268         if (isMatch) {
4269             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4270         } else {
4271             printf("No match\n\n");
4272         }
4273     }
4274 #endif
4275 
4276     fFrame = fp;                // The active stack frame when the engine stopped.
4277                                 //   Contains the capture group results that we need to
4278                                 //    access later.
4279 }
4280 
4281 
4282 //--------------------------------------------------------------------------------
4283 //
4284 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4285 //                  assumption that the entire string is available in the UText's
4286 //                  chunk buffer. For now, that means we can use int32_t indexes,
4287 //                  except for anything that needs to be saved (like group starts
4288 //                  and ends).
4289 //
4290 //                  startIdx:    begin matching a this index.
4291 //                  toEnd:       if true, match must extend to end of the input region
4292 //
4293 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4294 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4295     UBool       isMatch  = false;      // True if the we have a match.
4296 
4297     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4298 
4299     int32_t     op;                    // Operation from the compiled pattern, split into
4300     int32_t     opType;                //    the opcode
4301     int32_t     opValue;               //    and the operand value.
4302 
4303 #ifdef REGEX_RUN_DEBUG
4304     if (fTraceDebug) {
4305         printf("MatchAt(startIdx=%d)\n", startIdx);
4306         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4307         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4308     }
4309 #endif
4310 
4311     if (U_FAILURE(status)) {
4312         return;
4313     }
4314 
4315     //  Cache frequently referenced items from the compiled pattern
4316     //
4317     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4318 
4319     const char16_t      *litText       = fPattern->fLiteralText.getBuffer();
4320     UVector             *fSets         = fPattern->fSets;
4321 
4322     const char16_t      *inputBuf      = fInputText->chunkContents;
4323 
4324     fFrameSize = fPattern->fFrameSize;
4325     REStackFrame        *fp            = resetStack();
4326     if (U_FAILURE(fDeferredStatus)) {
4327         status = fDeferredStatus;
4328         return;
4329     }
4330 
4331     fp->fPatIdx   = 0;
4332     fp->fInputIdx = startIdx;
4333 
4334     // Zero out the pattern's static data
4335     int32_t i;
4336     for (i = 0; i<fPattern->fDataSize; i++) {
4337         fData[i] = 0;
4338     }
4339 
4340     //
4341     //  Main loop for interpreting the compiled pattern.
4342     //  One iteration of the loop per pattern operation performed.
4343     //
4344     for (;;) {
4345         op      = (int32_t)pat[fp->fPatIdx];
4346         opType  = URX_TYPE(op);
4347         opValue = URX_VAL(op);
4348 #ifdef REGEX_RUN_DEBUG
4349         if (fTraceDebug) {
4350             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4351             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4352                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4353             fPattern->dumpOp(fp->fPatIdx);
4354         }
4355 #endif
4356         fp->fPatIdx++;
4357 
4358         switch (opType) {
4359 
4360 
4361         case URX_NOP:
4362             break;
4363 
4364 
4365         case URX_BACKTRACK:
4366             // Force a backtrack.  In some circumstances, the pattern compiler
4367             //   will notice that the pattern can't possibly match anything, and will
4368             //   emit one of these at that point.
4369             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4370             break;
4371 
4372 
4373         case URX_ONECHAR:
4374             if (fp->fInputIdx < fActiveLimit) {
4375                 UChar32 c;
4376                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4377                 if (c == opValue) {
4378                     break;
4379                 }
4380             } else {
4381                 fHitEnd = true;
4382             }
4383             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4384             break;
4385 
4386 
4387         case URX_STRING:
4388             {
4389                 // Test input against a literal string.
4390                 // Strings require two slots in the compiled pattern, one for the
4391                 //   offset to the string text, and one for the length.
4392                 int32_t   stringStartIdx = opValue;
4393                 int32_t   stringLen;
4394 
4395                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4396                 fp->fPatIdx++;
4397                 opType    = URX_TYPE(op);
4398                 stringLen = URX_VAL(op);
4399                 U_ASSERT(opType == URX_STRING_LEN);
4400                 U_ASSERT(stringLen >= 2);
4401 
4402                 const char16_t * pInp = inputBuf + fp->fInputIdx;
4403                 const char16_t * pInpLimit = inputBuf + fActiveLimit;
4404                 const char16_t * pPat = litText+stringStartIdx;
4405                 const char16_t * pEnd = pInp + stringLen;
4406                 UBool success = true;
4407                 while (pInp < pEnd) {
4408                     if (pInp >= pInpLimit) {
4409                         fHitEnd = true;
4410                         success = false;
4411                         break;
4412                     }
4413                     if (*pInp++ != *pPat++) {
4414                         success = false;
4415                         break;
4416                     }
4417                 }
4418 
4419                 if (success) {
4420                     fp->fInputIdx += stringLen;
4421                 } else {
4422                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4423                 }
4424             }
4425             break;
4426 
4427 
4428         case URX_STATE_SAVE:
4429             fp = StateSave(fp, opValue, status);
4430             break;
4431 
4432 
4433         case URX_END:
4434             // The match loop will exit via this path on a successful match,
4435             //   when we reach the end of the pattern.
4436             if (toEnd && fp->fInputIdx != fActiveLimit) {
4437                 // The pattern matched, but not to the end of input.  Try some more.
4438                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4439                 break;
4440             }
4441             isMatch = true;
4442             goto  breakFromLoop;
4443 
4444             // Start and End Capture stack frame variables are laid out out like this:
4445             //  fp->fExtra[opValue]  - The start of a completed capture group
4446             //             opValue+1 - The end   of a completed capture group
4447             //             opValue+2 - the start of a capture group whose end
4448             //                          has not yet been reached (and might not ever be).
4449         case URX_START_CAPTURE:
4450             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4451             fp->fExtra[opValue+2] = fp->fInputIdx;
4452             break;
4453 
4454 
4455         case URX_END_CAPTURE:
4456             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4457             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4458             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4459             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4460             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4461             break;
4462 
4463 
4464         case URX_DOLLAR:                   //  $, test for End of line
4465             //     or for position before new line at end of input
4466             if (fp->fInputIdx < fAnchorLimit-2) {
4467                 // We are no where near the end of input.  Fail.
4468                 //   This is the common case.  Keep it first.
4469                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4470                 break;
4471             }
4472             if (fp->fInputIdx >= fAnchorLimit) {
4473                 // We really are at the end of input.  Success.
4474                 fHitEnd = true;
4475                 fRequireEnd = true;
4476                 break;
4477             }
4478 
4479             // If we are positioned just before a new-line that is located at the
4480             //   end of input, succeed.
4481             if (fp->fInputIdx == fAnchorLimit-1) {
4482                 UChar32 c;
4483                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4484 
4485                 if (isLineTerminator(c)) {
4486                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4487                         // At new-line at end of input. Success
4488                         fHitEnd = true;
4489                         fRequireEnd = true;
4490                         break;
4491                     }
4492                 }
4493             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4494                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4495                     fHitEnd = true;
4496                     fRequireEnd = true;
4497                     break;                         // At CR/LF at end of input.  Success
4498             }
4499 
4500             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4501 
4502             break;
4503 
4504 
4505         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4506             if (fp->fInputIdx >= fAnchorLimit-1) {
4507                 // Either at the last character of input, or off the end.
4508                 if (fp->fInputIdx == fAnchorLimit-1) {
4509                     // At last char of input.  Success if it's a new line.
4510                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4511                         fHitEnd = true;
4512                         fRequireEnd = true;
4513                         break;
4514                     }
4515                 } else {
4516                     // Off the end of input.  Success.
4517                     fHitEnd = true;
4518                     fRequireEnd = true;
4519                     break;
4520                 }
4521             }
4522 
4523             // Not at end of input.  Back-track out.
4524             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4525             break;
4526 
4527 
4528         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4529             {
4530                 if (fp->fInputIdx >= fAnchorLimit) {
4531                     // We really are at the end of input.  Success.
4532                     fHitEnd = true;
4533                     fRequireEnd = true;
4534                     break;
4535                 }
4536                 // If we are positioned just before a new-line, succeed.
4537                 // It makes no difference where the new-line is within the input.
4538                 UChar32 c = inputBuf[fp->fInputIdx];
4539                 if (isLineTerminator(c)) {
4540                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4541                     //  In multi-line mode, hitting a new-line just before the end of input does not
4542                     //   set the hitEnd or requireEnd flags
4543                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4544                         break;
4545                     }
4546                 }
4547                 // not at a new line.  Fail.
4548                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4549             }
4550             break;
4551 
4552 
4553         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4554             {
4555                 if (fp->fInputIdx >= fAnchorLimit) {
4556                     // We really are at the end of input.  Success.
4557                     fHitEnd = true;
4558                     fRequireEnd = true;  // Java set requireEnd in this case, even though
4559                     break;               //   adding a new-line would not lose the match.
4560                 }
4561                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4562                 // It makes no difference where the new-line is within the input.
4563                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4564                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4565                 }
4566             }
4567             break;
4568 
4569 
4570         case URX_CARET:                    //  ^, test for start of line
4571             if (fp->fInputIdx != fAnchorStart) {
4572                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4573             }
4574             break;
4575 
4576 
4577         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4578             {
4579                 if (fp->fInputIdx == fAnchorStart) {
4580                     // We are at the start input.  Success.
4581                     break;
4582                 }
4583                 // Check whether character just before the current pos is a new-line
4584                 //   unless we are at the end of input
4585                 char16_t  c = inputBuf[fp->fInputIdx - 1];
4586                 if ((fp->fInputIdx < fAnchorLimit) &&
4587                     isLineTerminator(c)) {
4588                     //  It's a new-line.  ^ is true.  Success.
4589                     //  TODO:  what should be done with positions between a CR and LF?
4590                     break;
4591                 }
4592                 // Not at the start of a line.  Fail.
4593                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4594             }
4595             break;
4596 
4597 
4598         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4599             {
4600                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4601                 if (fp->fInputIdx <= fAnchorStart) {
4602                     // We are at the start input.  Success.
4603                     break;
4604                 }
4605                 // Check whether character just before the current pos is a new-line
4606                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4607                 char16_t  c = inputBuf[fp->fInputIdx - 1];
4608                 if (c != 0x0a) {
4609                     // Not at the start of a line.  Back-track out.
4610                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4611                 }
4612             }
4613             break;
4614 
4615         case URX_BACKSLASH_B:          // Test for word boundaries
4616             {
4617                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4618                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4619                 if (!success) {
4620                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4621                 }
4622             }
4623             break;
4624 
4625 
4626         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4627             {
4628                 UBool success = isUWordBoundary(fp->fInputIdx, status);
4629                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4630                 if (!success) {
4631                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4632                 }
4633             }
4634             break;
4635 
4636 
4637         case URX_BACKSLASH_D:            // Test for decimal digit
4638             {
4639                 if (fp->fInputIdx >= fActiveLimit) {
4640                     fHitEnd = true;
4641                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4642                     break;
4643                 }
4644 
4645                 UChar32 c;
4646                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4647                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4648                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4649                 success ^= (UBool)(opValue != 0);        // flip sense for \D
4650                 if (!success) {
4651                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4652                 }
4653             }
4654             break;
4655 
4656 
4657         case URX_BACKSLASH_G:          // Test for position at end of previous match
4658             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
4659                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4660             }
4661             break;
4662 
4663 
4664         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
4665             {
4666                 if (fp->fInputIdx >= fActiveLimit) {
4667                     fHitEnd = true;
4668                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4669                     break;
4670                 }
4671                 UChar32 c;
4672                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4673                 int8_t ctype = u_charType(c);
4674                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
4675                 success ^= (UBool)(opValue != 0);        // flip sense for \H
4676                 if (!success) {
4677                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4678                 }
4679             }
4680             break;
4681 
4682 
4683         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
4684             {
4685                 if (fp->fInputIdx >= fActiveLimit) {
4686                     fHitEnd = true;
4687                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4688                     break;
4689                 }
4690                 UChar32 c;
4691                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4692                 if (isLineTerminator(c)) {
4693                     if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4694                         // Check for CR/LF sequence. Consume both together when found.
4695                         char16_t c2;
4696                         U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4697                         if (c2 != 0x0a) {
4698                             U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4699                         }
4700                     }
4701                 } else {
4702                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4703                 }
4704             }
4705             break;
4706 
4707 
4708         case URX_BACKSLASH_V:         // Any single code point line ending.
4709             {
4710                 if (fp->fInputIdx >= fActiveLimit) {
4711                     fHitEnd = true;
4712                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4713                     break;
4714                 }
4715                 UChar32 c;
4716                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4717                 UBool success = isLineTerminator(c);
4718                 success ^= (UBool)(opValue != 0);        // flip sense for \V
4719                 if (!success) {
4720                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4721                 }
4722             }
4723             break;
4724 
4725 
4726         case URX_BACKSLASH_X:
4727             //  Match a Grapheme, as defined by Unicode UAX 29.
4728 
4729             // Fail if at end of input
4730             if (fp->fInputIdx >= fActiveLimit) {
4731                 fHitEnd = true;
4732                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4733                 break;
4734             }
4735 
4736             fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
4737             if (fp->fInputIdx >= fActiveLimit) {
4738                 fHitEnd = true;
4739                 fp->fInputIdx = fActiveLimit;
4740             }
4741             break;
4742 
4743 
4744         case URX_BACKSLASH_Z:          // Test for end of Input
4745             if (fp->fInputIdx < fAnchorLimit) {
4746                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4747             } else {
4748                 fHitEnd = true;
4749                 fRequireEnd = true;
4750             }
4751             break;
4752 
4753 
4754 
4755         case URX_STATIC_SETREF:
4756             {
4757                 // Test input character against one of the predefined sets
4758                 //    (Word Characters, for example)
4759                 // The high bit of the op value is a flag for the match polarity.
4760                 //    0:   success if input char is in set.
4761                 //    1:   success if input char is not in set.
4762                 if (fp->fInputIdx >= fActiveLimit) {
4763                     fHitEnd = true;
4764                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4765                     break;
4766                 }
4767 
4768                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4769                 opValue &= ~URX_NEG_SET;
4770                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4771 
4772                 UChar32 c;
4773                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4774                 if (c < 256) {
4775                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4776                     if (s8.contains(c)) {
4777                         success = !success;
4778                     }
4779                 } else {
4780                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4781                     if (s.contains(c)) {
4782                         success = !success;
4783                     }
4784                 }
4785                 if (!success) {
4786                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4787                 }
4788             }
4789             break;
4790 
4791 
4792         case URX_STAT_SETREF_N:
4793             {
4794                 // Test input character for NOT being a member of  one of
4795                 //    the predefined sets (Word Characters, for example)
4796                 if (fp->fInputIdx >= fActiveLimit) {
4797                     fHitEnd = true;
4798                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4799                     break;
4800                 }
4801 
4802                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4803 
4804                 UChar32  c;
4805                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4806                 if (c < 256) {
4807                     Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4808                     if (s8.contains(c) == false) {
4809                         break;
4810                     }
4811                 } else {
4812                     const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4813                     if (s.contains(c) == false) {
4814                         break;
4815                     }
4816                 }
4817                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4818             }
4819             break;
4820 
4821 
4822         case URX_SETREF:
4823             {
4824                 if (fp->fInputIdx >= fActiveLimit) {
4825                     fHitEnd = true;
4826                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4827                     break;
4828                 }
4829 
4830                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4831 
4832                 // There is input left.  Pick up one char and test it for set membership.
4833                 UChar32  c;
4834                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4835                 if (c<256) {
4836                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4837                     if (s8->contains(c)) {
4838                         // The character is in the set.  A Match.
4839                         break;
4840                     }
4841                 } else {
4842                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4843                     if (s->contains(c)) {
4844                         // The character is in the set.  A Match.
4845                         break;
4846                     }
4847                 }
4848 
4849                 // the character wasn't in the set.
4850                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4851             }
4852             break;
4853 
4854 
4855         case URX_DOTANY:
4856             {
4857                 // . matches anything, but stops at end-of-line.
4858                 if (fp->fInputIdx >= fActiveLimit) {
4859                     // At end of input.  Match failed.  Backtrack out.
4860                     fHitEnd = true;
4861                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4862                     break;
4863                 }
4864 
4865                 // There is input left.  Advance over one char, unless we've hit end-of-line
4866                 UChar32  c;
4867                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4868                 if (isLineTerminator(c)) {
4869                     // End of line in normal mode.   . does not match.
4870                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4871                     break;
4872                 }
4873             }
4874             break;
4875 
4876 
4877         case URX_DOTANY_ALL:
4878             {
4879                 // . in dot-matches-all (including new lines) mode
4880                 if (fp->fInputIdx >= fActiveLimit) {
4881                     // At end of input.  Match failed.  Backtrack out.
4882                     fHitEnd = true;
4883                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4884                     break;
4885                 }
4886 
4887                 // There is input left.  Advance over one char, except if we are
4888                 //   at a cr/lf, advance over both of them.
4889                 UChar32 c;
4890                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4891                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4892                     // In the case of a CR/LF, we need to advance over both.
4893                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4894                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4895                     }
4896                 }
4897             }
4898             break;
4899 
4900 
4901         case URX_DOTANY_UNIX:
4902             {
4903                 // '.' operator, matches all, but stops at end-of-line.
4904                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
4905                 if (fp->fInputIdx >= fActiveLimit) {
4906                     // At end of input.  Match failed.  Backtrack out.
4907                     fHitEnd = true;
4908                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4909                     break;
4910                 }
4911 
4912                 // There is input left.  Advance over one char, unless we've hit end-of-line
4913                 UChar32 c;
4914                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4915                 if (c == 0x0a) {
4916                     // End of line in normal mode.   '.' does not match the \n
4917                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4918                 }
4919             }
4920             break;
4921 
4922 
4923         case URX_JMP:
4924             fp->fPatIdx = opValue;
4925             break;
4926 
4927         case URX_FAIL:
4928             isMatch = false;
4929             goto breakFromLoop;
4930 
4931         case URX_JMP_SAV:
4932             U_ASSERT(opValue < fPattern->fCompiledPat->size());
4933             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
4934             fp->fPatIdx = opValue;                         // Then JMP.
4935             break;
4936 
4937         case URX_JMP_SAV_X:
4938             // This opcode is used with (x)+, when x can match a zero length string.
4939             // Same as JMP_SAV, except conditional on the match having made forward progress.
4940             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4941             //   data address of the input position at the start of the loop.
4942             {
4943                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4944                 int32_t  stoOp = (int32_t)pat[opValue-1];
4945                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4946                 int32_t  frameLoc = URX_VAL(stoOp);
4947                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4948                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4949                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4950                 if (prevInputIdx < fp->fInputIdx) {
4951                     // The match did make progress.  Repeat the loop.
4952                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
4953                     fp->fPatIdx = opValue;
4954                     fp->fExtra[frameLoc] = fp->fInputIdx;
4955                 }
4956                 // If the input position did not advance, we do nothing here,
4957                 //   execution will fall out of the loop.
4958             }
4959             break;
4960 
4961         case URX_CTR_INIT:
4962             {
4963                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4964                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
4965 
4966                 // Pick up the three extra operands that CTR_INIT has, and
4967                 //    skip the pattern location counter past
4968                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4969                 fp->fPatIdx += 3;
4970                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
4971                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4972                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4973                 U_ASSERT(minCount>=0);
4974                 U_ASSERT(maxCount>=minCount || maxCount==-1);
4975                 U_ASSERT(loopLoc>=fp->fPatIdx);
4976 
4977                 if (minCount == 0) {
4978                     fp = StateSave(fp, loopLoc+1, status);
4979                 }
4980                 if (maxCount == -1) {
4981                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
4982                 } else if (maxCount == 0) {
4983                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4984                 }
4985             }
4986             break;
4987 
4988         case URX_CTR_LOOP:
4989             {
4990                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4991                 int32_t initOp = (int32_t)pat[opValue];
4992                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4993                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4994                 int32_t minCount  = (int32_t)pat[opValue+2];
4995                 int32_t maxCount  = (int32_t)pat[opValue+3];
4996                 (*pCounter)++;
4997                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
4998                     U_ASSERT(*pCounter == maxCount);
4999                     break;
5000                 }
5001                 if (*pCounter >= minCount) {
5002                     if (maxCount == -1) {
5003                         // Loop has no hard upper bound.
5004                         // Check that it is progressing through the input, break if it is not.
5005                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5006                         if (fp->fInputIdx == *pLastInputIdx) {
5007                             break;
5008                         } else {
5009                             *pLastInputIdx = fp->fInputIdx;
5010                         }
5011                     }
5012                     fp = StateSave(fp, fp->fPatIdx, status);
5013                 } else {
5014                     // Increment time-out counter. (StateSave() does it if count >= minCount)
5015                     fTickCounter--;
5016                     if (fTickCounter <= 0) {
5017                         IncrementTime(status);    // Re-initializes fTickCounter
5018                     }
5019                 }
5020                 fp->fPatIdx = opValue + 4;    // Loop back.
5021             }
5022             break;
5023 
5024         case URX_CTR_INIT_NG:
5025             {
5026                 // Initialize a non-greedy loop
5027                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5028                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5029 
5030                 // Pick up the three extra operands that CTR_INIT_NG has, and
5031                 //    skip the pattern location counter past
5032                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5033                 fp->fPatIdx += 3;
5034                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5035                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5036                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5037                 U_ASSERT(minCount>=0);
5038                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5039                 U_ASSERT(loopLoc>fp->fPatIdx);
5040                 if (maxCount == -1) {
5041                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
5042                 }
5043 
5044                 if (minCount == 0) {
5045                     if (maxCount != 0) {
5046                         fp = StateSave(fp, fp->fPatIdx, status);
5047                     }
5048                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5049                 }
5050             }
5051             break;
5052 
5053         case URX_CTR_LOOP_NG:
5054             {
5055                 // Non-greedy {min, max} loops
5056                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5057                 int32_t initOp = (int32_t)pat[opValue];
5058                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5059                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5060                 int32_t minCount  = (int32_t)pat[opValue+2];
5061                 int32_t maxCount  = (int32_t)pat[opValue+3];
5062 
5063                 (*pCounter)++;
5064                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5065                     // The loop has matched the maximum permitted number of times.
5066                     //   Break out of here with no action.  Matching will
5067                     //   continue with the following pattern.
5068                     U_ASSERT(*pCounter == maxCount);
5069                     break;
5070                 }
5071 
5072                 if (*pCounter < minCount) {
5073                     // We haven't met the minimum number of matches yet.
5074                     //   Loop back for another one.
5075                     fp->fPatIdx = opValue + 4;    // Loop back.
5076                     fTickCounter--;
5077                     if (fTickCounter <= 0) {
5078                         IncrementTime(status);    // Re-initializes fTickCounter
5079                     }
5080                 } else {
5081                     // We do have the minimum number of matches.
5082 
5083                     // If there is no upper bound on the loop iterations, check that the input index
5084                     // is progressing, and stop the loop if it is not.
5085                     if (maxCount == -1) {
5086                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5087                         if (fp->fInputIdx == *pLastInputIdx) {
5088                             break;
5089                         }
5090                         *pLastInputIdx = fp->fInputIdx;
5091                     }
5092 
5093                     // Loop Continuation: we will fall into the pattern following the loop
5094                     //   (non-greedy, don't execute loop body first), but first do
5095                     //   a state save to the top of the loop, so that a match failure
5096                     //   in the following pattern will try another iteration of the loop.
5097                     fp = StateSave(fp, opValue + 4, status);
5098                 }
5099             }
5100             break;
5101 
5102         case URX_STO_SP:
5103             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5104             fData[opValue] = fStack->size();
5105             break;
5106 
5107         case URX_LD_SP:
5108             {
5109                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5110                 int32_t newStackSize = (int32_t)fData[opValue];
5111                 U_ASSERT(newStackSize <= fStack->size());
5112                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5113                 if (newFP == (int64_t *)fp) {
5114                     break;
5115                 }
5116                 int32_t j;
5117                 for (j=0; j<fFrameSize; j++) {
5118                     newFP[j] = ((int64_t *)fp)[j];
5119                 }
5120                 fp = (REStackFrame *)newFP;
5121                 fStack->setSize(newStackSize);
5122             }
5123             break;
5124 
5125         case URX_BACKREF:
5126             {
5127                 U_ASSERT(opValue < fFrameSize);
5128                 int64_t groupStartIdx = fp->fExtra[opValue];
5129                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5130                 U_ASSERT(groupStartIdx <= groupEndIdx);
5131                 int64_t inputIndex = fp->fInputIdx;
5132                 if (groupStartIdx < 0) {
5133                     // This capture group has not participated in the match thus far,
5134                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5135                     break;
5136                 }
5137                 UBool success = true;
5138                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5139                     if (inputIndex >= fActiveLimit) {
5140                         success = false;
5141                         fHitEnd = true;
5142                         break;
5143                     }
5144                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5145                         success = false;
5146                         break;
5147                     }
5148                 }
5149                 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5150                         inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5151                     // Capture group ended with an unpaired lead surrogate.
5152                     // Back reference is not permitted to match lead only of a surrogatge pair.
5153                     success = false;
5154                 }
5155                 if (success) {
5156                     fp->fInputIdx = inputIndex;
5157                 } else {
5158                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5159                 }
5160             }
5161             break;
5162 
5163         case URX_BACKREF_I:
5164             {
5165                 U_ASSERT(opValue < fFrameSize);
5166                 int64_t groupStartIdx = fp->fExtra[opValue];
5167                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5168                 U_ASSERT(groupStartIdx <= groupEndIdx);
5169                 if (groupStartIdx < 0) {
5170                     // This capture group has not participated in the match thus far,
5171                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5172                     break;
5173                 }
5174                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5175                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5176 
5177                 //   Note: if the capture group match was of an empty string the backref
5178                 //         match succeeds.  Verified by testing:  Perl matches succeed
5179                 //         in this case, so we do too.
5180 
5181                 UBool success = true;
5182                 for (;;) {
5183                     UChar32 captureGroupChar = captureGroupItr.next();
5184                     if (captureGroupChar == U_SENTINEL) {
5185                         success = true;
5186                         break;
5187                     }
5188                     UChar32 inputChar = inputItr.next();
5189                     if (inputChar == U_SENTINEL) {
5190                         success = false;
5191                         fHitEnd = true;
5192                         break;
5193                     }
5194                     if (inputChar != captureGroupChar) {
5195                         success = false;
5196                         break;
5197                     }
5198                 }
5199 
5200                 if (success && inputItr.inExpansion()) {
5201                     // We obtained a match by consuming part of a string obtained from
5202                     // case-folding a single code point of the input text.
5203                     // This does not count as an overall match.
5204                     success = false;
5205                 }
5206 
5207                 if (success) {
5208                     fp->fInputIdx = inputItr.getIndex();
5209                 } else {
5210                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5211                 }
5212             }
5213             break;
5214 
5215         case URX_STO_INP_LOC:
5216             {
5217                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5218                 fp->fExtra[opValue] = fp->fInputIdx;
5219             }
5220             break;
5221 
5222         case URX_JMPX:
5223             {
5224                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5225                 fp->fPatIdx += 1;
5226                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5227                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5228                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5229                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5230                 if (savedInputIdx < fp->fInputIdx) {
5231                     fp->fPatIdx = opValue;                               // JMP
5232                 } else {
5233                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5234                 }
5235             }
5236             break;
5237 
5238         case URX_LA_START:
5239             {
5240                 // Entering a look around block.
5241                 // Save Stack Ptr, Input Pos.
5242                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5243                 fData[opValue]   = fStack->size();
5244                 fData[opValue+1] = fp->fInputIdx;
5245                 fData[opValue+2] = fActiveStart;
5246                 fData[opValue+3] = fActiveLimit;
5247                 fActiveStart     = fLookStart;          // Set the match region change for
5248                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5249             }
5250             break;
5251 
5252         case URX_LA_END:
5253             {
5254                 // Leaving a look around block.
5255                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5256                 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5257                 int32_t stackSize = fStack->size();
5258                 int32_t newStackSize = (int32_t)fData[opValue];
5259                 U_ASSERT(stackSize >= newStackSize);
5260                 if (stackSize > newStackSize) {
5261                     // Copy the current top frame back to the new (cut back) top frame.
5262                     //   This makes the capture groups from within the look-ahead
5263                     //   expression available.
5264                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5265                     int32_t j;
5266                     for (j=0; j<fFrameSize; j++) {
5267                         newFP[j] = ((int64_t *)fp)[j];
5268                     }
5269                     fp = (REStackFrame *)newFP;
5270                     fStack->setSize(newStackSize);
5271                 }
5272                 fp->fInputIdx = fData[opValue+1];
5273 
5274                 // Restore the active region bounds in the input string; they may have
5275                 //    been changed because of transparent bounds on a Region.
5276                 fActiveStart = fData[opValue+2];
5277                 fActiveLimit = fData[opValue+3];
5278                 U_ASSERT(fActiveStart >= 0);
5279                 U_ASSERT(fActiveLimit <= fInputLength);
5280             }
5281             break;
5282 
5283         case URX_ONECHAR_I:
5284             if (fp->fInputIdx < fActiveLimit) {
5285                 UChar32 c;
5286                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5287                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5288                     break;
5289                 }
5290             } else {
5291                 fHitEnd = true;
5292             }
5293             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5294             break;
5295 
5296         case URX_STRING_I:
5297             // Case-insensitive test input against a literal string.
5298             // Strings require two slots in the compiled pattern, one for the
5299             //   offset to the string text, and one for the length.
5300             //   The compiled string has already been case folded.
5301             {
5302                 const char16_t *patternString = litText + opValue;
5303 
5304                 op      = (int32_t)pat[fp->fPatIdx];
5305                 fp->fPatIdx++;
5306                 opType  = URX_TYPE(op);
5307                 opValue = URX_VAL(op);
5308                 U_ASSERT(opType == URX_STRING_LEN);
5309                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5310 
5311                 UChar32      cText;
5312                 UChar32      cPattern;
5313                 UBool        success = true;
5314                 int32_t      patternStringIdx  = 0;
5315                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5316                 while (patternStringIdx < patternStringLen) {
5317                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5318                     cText = inputIterator.next();
5319                     if (cText != cPattern) {
5320                         success = false;
5321                         if (cText == U_SENTINEL) {
5322                             fHitEnd = true;
5323                         }
5324                         break;
5325                     }
5326                 }
5327                 if (inputIterator.inExpansion()) {
5328                     success = false;
5329                 }
5330 
5331                 if (success) {
5332                     fp->fInputIdx = inputIterator.getIndex();
5333                 } else {
5334                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5335                 }
5336             }
5337             break;
5338 
5339         case URX_LB_START:
5340             {
5341                 // Entering a look-behind block.
5342                 // Save Stack Ptr, Input Pos and active input region.
5343                 //   TODO:  implement transparent bounds.  Ticket #6067
5344                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5345                 fData[opValue]   = fStack->size();
5346                 fData[opValue+1] = fp->fInputIdx;
5347                 // Save input string length, then reset to pin any matches to end at
5348                 //   the current position.
5349                 fData[opValue+2] = fActiveStart;
5350                 fData[opValue+3] = fActiveLimit;
5351                 fActiveStart     = fRegionStart;
5352                 fActiveLimit     = fp->fInputIdx;
5353                 // Init the variable containing the start index for attempted matches.
5354                 fData[opValue+4] = -1;
5355             }
5356             break;
5357 
5358 
5359         case URX_LB_CONT:
5360             {
5361                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5362                 //    at all possible input starting positions.
5363 
5364                 // Fetch the min and max possible match lengths.  They are the operands
5365                 //   of this op in the pattern.
5366                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5367                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5368                 U_ASSERT(minML <= maxML);
5369                 U_ASSERT(minML >= 0);
5370 
5371                 // Fetch (from data) the last input index where a match was attempted.
5372                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5373                 int64_t  &lbStartIdx = fData[opValue+4];
5374                 if (lbStartIdx < 0) {
5375                     // First time through loop.
5376                     lbStartIdx = fp->fInputIdx - minML;
5377                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5378                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5379                     }
5380                 } else {
5381                     // 2nd through nth time through the loop.
5382                     // Back up start position for match by one.
5383                     if (lbStartIdx == 0) {
5384                         lbStartIdx--;
5385                     } else {
5386                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5387                     }
5388                 }
5389 
5390                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5391                     // We have tried all potential match starting points without
5392                     //  getting a match.  Backtrack out, and out of the
5393                     //   Look Behind altogether.
5394                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5395                     fActiveStart = fData[opValue+2];
5396                     fActiveLimit = fData[opValue+3];
5397                     U_ASSERT(fActiveStart >= 0);
5398                     U_ASSERT(fActiveLimit <= fInputLength);
5399                     break;
5400                 }
5401 
5402                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5403                 //      (successful match will fall off the end of the loop.)
5404                 fp = StateSave(fp, fp->fPatIdx-3, status);
5405                 fp->fInputIdx =  lbStartIdx;
5406             }
5407             break;
5408 
5409         case URX_LB_END:
5410             // End of a look-behind block, after a successful match.
5411             {
5412                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5413                 if (fp->fInputIdx != fActiveLimit) {
5414                     //  The look-behind expression matched, but the match did not
5415                     //    extend all the way to the point that we are looking behind from.
5416                     //  FAIL out of here, which will take us back to the LB_CONT, which
5417                     //     will retry the match starting at another position or fail
5418                     //     the look-behind altogether, whichever is appropriate.
5419                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5420                     break;
5421                 }
5422 
5423                 // Look-behind match is good.  Restore the original input string region,
5424                 //   which had been truncated to pin the end of the lookbehind match to the
5425                 //   position being looked-behind.
5426                 fActiveStart = fData[opValue+2];
5427                 fActiveLimit = fData[opValue+3];
5428                 U_ASSERT(fActiveStart >= 0);
5429                 U_ASSERT(fActiveLimit <= fInputLength);
5430             }
5431             break;
5432 
5433 
5434         case URX_LBN_CONT:
5435             {
5436                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5437                 //    at all possible input starting positions.
5438 
5439                 // Fetch the extra parameters of this op.
5440                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5441                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5442                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5443                 continueLoc = URX_VAL(continueLoc);
5444                 U_ASSERT(minML <= maxML);
5445                 U_ASSERT(minML >= 0);
5446                 U_ASSERT(continueLoc > fp->fPatIdx);
5447 
5448                 // Fetch (from data) the last input index where a match was attempted.
5449                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5450                 int64_t  &lbStartIdx = fData[opValue+4];
5451                 if (lbStartIdx < 0) {
5452                     // First time through loop.
5453                     lbStartIdx = fp->fInputIdx - minML;
5454                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5455                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5456                     }
5457                 } else {
5458                     // 2nd through nth time through the loop.
5459                     // Back up start position for match by one.
5460                     if (lbStartIdx == 0) {
5461                         lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
5462                     } else {
5463                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5464                     }
5465                 }
5466 
5467                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5468                     // We have tried all potential match starting points without
5469                     //  getting a match, which means that the negative lookbehind as
5470                     //  a whole has succeeded.  Jump forward to the continue location
5471                     fActiveStart = fData[opValue+2];
5472                     fActiveLimit = fData[opValue+3];
5473                     U_ASSERT(fActiveStart >= 0);
5474                     U_ASSERT(fActiveLimit <= fInputLength);
5475                     fp->fPatIdx = continueLoc;
5476                     break;
5477                 }
5478 
5479                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5480                 //      (successful match will cause a FAIL out of the loop altogether.)
5481                 fp = StateSave(fp, fp->fPatIdx-4, status);
5482                 fp->fInputIdx =  lbStartIdx;
5483             }
5484             break;
5485 
5486         case URX_LBN_END:
5487             // End of a negative look-behind block, after a successful match.
5488             {
5489                 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5490                 if (fp->fInputIdx != fActiveLimit) {
5491                     //  The look-behind expression matched, but the match did not
5492                     //    extend all the way to the point that we are looking behind from.
5493                     //  FAIL out of here, which will take us back to the LB_CONT, which
5494                     //     will retry the match starting at another position or succeed
5495                     //     the look-behind altogether, whichever is appropriate.
5496                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5497                     break;
5498                 }
5499 
5500                 // Look-behind expression matched, which means look-behind test as
5501                 //   a whole Fails
5502 
5503                 //   Restore the original input string length, which had been truncated
5504                 //   inorder to pin the end of the lookbehind match
5505                 //   to the position being looked-behind.
5506                 fActiveStart = fData[opValue+2];
5507                 fActiveLimit = fData[opValue+3];
5508                 U_ASSERT(fActiveStart >= 0);
5509                 U_ASSERT(fActiveLimit <= fInputLength);
5510 
5511                 // Restore original stack position, discarding any state saved
5512                 //   by the successful pattern match.
5513                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5514                 int32_t newStackSize = (int32_t)fData[opValue];
5515                 U_ASSERT(fStack->size() > newStackSize);
5516                 fStack->setSize(newStackSize);
5517 
5518                 //  FAIL, which will take control back to someplace
5519                 //  prior to entering the look-behind test.
5520                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5521             }
5522             break;
5523 
5524 
5525         case URX_LOOP_SR_I:
5526             // Loop Initialization for the optimized implementation of
5527             //     [some character set]*
5528             //   This op scans through all matching input.
5529             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5530             {
5531                 U_ASSERT(opValue > 0 && opValue < fSets->size());
5532                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5533                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
5534 
5535                 // Loop through input, until either the input is exhausted or
5536                 //   we reach a character that is not a member of the set.
5537                 int32_t ix = (int32_t)fp->fInputIdx;
5538                 for (;;) {
5539                     if (ix >= fActiveLimit) {
5540                         fHitEnd = true;
5541                         break;
5542                     }
5543                     UChar32   c;
5544                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5545                     if (c<256) {
5546                         if (s8->contains(c) == false) {
5547                             U16_BACK_1(inputBuf, 0, ix);
5548                             break;
5549                         }
5550                     } else {
5551                         if (s->contains(c) == false) {
5552                             U16_BACK_1(inputBuf, 0, ix);
5553                             break;
5554                         }
5555                     }
5556                 }
5557 
5558                 // If there were no matching characters, skip over the loop altogether.
5559                 //   The loop doesn't run at all, a * op always succeeds.
5560                 if (ix == fp->fInputIdx) {
5561                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5562                     break;
5563                 }
5564 
5565                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5566                 //   must follow.  It's operand is the stack location
5567                 //   that holds the starting input index for the match of this [set]*
5568                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5569                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5570                 int32_t stackLoc = URX_VAL(loopcOp);
5571                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5572                 fp->fExtra[stackLoc] = fp->fInputIdx;
5573                 fp->fInputIdx = ix;
5574 
5575                 // Save State to the URX_LOOP_C op that follows this one,
5576                 //   so that match failures in the following code will return to there.
5577                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5578                 fp = StateSave(fp, fp->fPatIdx, status);
5579                 fp->fPatIdx++;
5580             }
5581             break;
5582 
5583 
5584         case URX_LOOP_DOT_I:
5585             // Loop Initialization for the optimized implementation of .*
5586             //   This op scans through all remaining input.
5587             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5588             {
5589                 // Loop through input until the input is exhausted (we reach an end-of-line)
5590                 // In DOTALL mode, we can just go straight to the end of the input.
5591                 int32_t ix;
5592                 if ((opValue & 1) == 1) {
5593                     // Dot-matches-All mode.  Jump straight to the end of the string.
5594                     ix = (int32_t)fActiveLimit;
5595                     fHitEnd = true;
5596                 } else {
5597                     // NOT DOT ALL mode.  Line endings do not match '.'
5598                     // Scan forward until a line ending or end of input.
5599                     ix = (int32_t)fp->fInputIdx;
5600                     for (;;) {
5601                         if (ix >= fActiveLimit) {
5602                             fHitEnd = true;
5603                             break;
5604                         }
5605                         UChar32   c;
5606                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5607                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5608                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5609                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5610                                    isLineTerminator(c))) {
5611                                 //  char is a line ending.  Put the input pos back to the
5612                                 //    line ending char, and exit the scanning loop.
5613                                 U16_BACK_1(inputBuf, 0, ix);
5614                                 break;
5615                             }
5616                         }
5617                     }
5618                 }
5619 
5620                 // If there were no matching characters, skip over the loop altogether.
5621                 //   The loop doesn't run at all, a * op always succeeds.
5622                 if (ix == fp->fInputIdx) {
5623                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5624                     break;
5625                 }
5626 
5627                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5628                 //   must follow.  It's operand is the stack location
5629                 //   that holds the starting input index for the match of this .*
5630                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5631                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5632                 int32_t stackLoc = URX_VAL(loopcOp);
5633                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5634                 fp->fExtra[stackLoc] = fp->fInputIdx;
5635                 fp->fInputIdx = ix;
5636 
5637                 // Save State to the URX_LOOP_C op that follows this one,
5638                 //   so that match failures in the following code will return to there.
5639                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5640                 fp = StateSave(fp, fp->fPatIdx, status);
5641                 fp->fPatIdx++;
5642             }
5643             break;
5644 
5645 
5646         case URX_LOOP_C:
5647             {
5648                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5649                 backSearchIndex = (int32_t)fp->fExtra[opValue];
5650                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5651                 if (backSearchIndex == fp->fInputIdx) {
5652                     // We've backed up the input idx to the point that the loop started.
5653                     // The loop is done.  Leave here without saving state.
5654                     //  Subsequent failures won't come back here.
5655                     break;
5656                 }
5657                 // Set up for the next iteration of the loop, with input index
5658                 //   backed up by one from the last time through,
5659                 //   and a state save to this instruction in case the following code fails again.
5660                 //   (We're going backwards because this loop emulates stack unwinding, not
5661                 //    the initial scan forward.)
5662                 U_ASSERT(fp->fInputIdx > 0);
5663                 UChar32 prevC;
5664                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5665 
5666                 if (prevC == 0x0a &&
5667                     fp->fInputIdx > backSearchIndex &&
5668                     inputBuf[fp->fInputIdx-1] == 0x0d) {
5669                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5670                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5671                         // .*, stepping back over CRLF pair.
5672                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5673                     }
5674                 }
5675 
5676 
5677                 fp = StateSave(fp, fp->fPatIdx-1, status);
5678             }
5679             break;
5680 
5681 
5682 
5683         default:
5684             // Trouble.  The compiled pattern contains an entry with an
5685             //           unrecognized type tag.
5686             UPRV_UNREACHABLE_ASSERT;
5687             // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
5688             // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
5689             // See ICU-21669.
5690             status = U_INTERNAL_PROGRAM_ERROR;
5691         }
5692 
5693         if (U_FAILURE(status)) {
5694             isMatch = false;
5695             break;
5696         }
5697     }
5698 
5699 breakFromLoop:
5700     fMatch = isMatch;
5701     if (isMatch) {
5702         fLastMatchEnd = fMatchEnd;
5703         fMatchStart   = startIdx;
5704         fMatchEnd     = fp->fInputIdx;
5705     }
5706 
5707 #ifdef REGEX_RUN_DEBUG
5708     if (fTraceDebug) {
5709         if (isMatch) {
5710             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5711         } else {
5712             printf("No match\n\n");
5713         }
5714     }
5715 #endif
5716 
5717     fFrame = fp;                // The active stack frame when the engine stopped.
5718                                 //   Contains the capture group results that we need to
5719                                 //    access later.
5720 }
5721 
5722 
5723 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5724 
5725 U_NAMESPACE_END
5726 
5727 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5728 
5729