• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 *   Copyright (C) 2002-2016 International Business Machines Corporation
6 *   and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 //  file:  rematch.cpp
11 //
12 //         Contains the implementation of class RegexMatcher,
13 //         which is one of the main API classes for the ICU regular expression package.
14 //
15 
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18 
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36 
37 // #include <malloc.h>        // Needed for heapcheck testing
38 
39 
40 U_NAMESPACE_BEGIN
41 
42 // Default limit for the size of the back track stack, to avoid system
43 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 //    which use recursion rather than the heap, and take more storage per
46 //    backtrack point.
47 //
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49 
50 // Time limit counter constant.
51 //   Time limits for expression evaluation are in terms of quanta of work by
52 //   the engine, each of which is 10,000 state saves.
53 //   This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55 
56 
57 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)58 static inline UBool isLineTerminator(UChar32 c) {
59     if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60         return false;
61     }
62     return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64 
65 //-----------------------------------------------------------------------------
66 //
67 //   Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)70 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
71     fDeferredStatus = U_ZERO_ERROR;
72     init(fDeferredStatus);
73     if (U_FAILURE(fDeferredStatus)) {
74         return;
75     }
76     if (pat==NULL) {
77         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78         return;
79     }
80     fPattern = pat;
81     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83 
84 
85 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)86 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87                            uint32_t flags, UErrorCode &status) {
88     init(status);
89     if (U_FAILURE(status)) {
90         return;
91     }
92     UParseError    pe;
93     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
94     fPattern           = fPatternOwned;
95 
96     UText inputText = UTEXT_INITIALIZER;
97     utext_openConstUnicodeString(&inputText, &input, &status);
98     init2(&inputText, status);
99     utext_close(&inputText);
100 
101     fInputUniStrMaybeMutable = TRUE;
102 }
103 
104 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106                            uint32_t flags, UErrorCode &status) {
107     init(status);
108     if (U_FAILURE(status)) {
109         return;
110     }
111     UParseError    pe;
112     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
113     if (U_FAILURE(status)) {
114         return;
115     }
116 
117     fPattern           = fPatternOwned;
118     init2(input, status);
119 }
120 
121 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)122 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123                            uint32_t flags, UErrorCode &status) {
124     init(status);
125     if (U_FAILURE(status)) {
126         return;
127     }
128     UParseError    pe;
129     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
130     if (U_FAILURE(status)) {
131         return;
132     }
133     fPattern           = fPatternOwned;
134     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)137 RegexMatcher::RegexMatcher(UText *regexp,
138                            uint32_t flags, UErrorCode &status) {
139     init(status);
140     if (U_FAILURE(status)) {
141         return;
142     }
143     UParseError    pe;
144     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
145         if (U_FAILURE(status)) {
146         return;
147     }
148 
149     fPattern           = fPatternOwned;
150     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152 
153 
154 
155 
~RegexMatcher()156 RegexMatcher::~RegexMatcher() {
157     delete fStack;
158     if (fData != fSmallData) {
159         uprv_free(fData);
160         fData = NULL;
161     }
162     if (fPatternOwned) {
163         delete fPatternOwned;
164         fPatternOwned = NULL;
165         fPattern = NULL;
166     }
167 
168     if (fInput) {
169         delete fInput;
170     }
171     if (fInputText) {
172         utext_close(fInputText);
173     }
174     if (fAltInputText) {
175         utext_close(fAltInputText);
176     }
177 
178     #if UCONFIG_NO_BREAK_ITERATION==0
179     delete fWordBreakItr;
180     #endif
181 }
182 
183 //
184 //   init()   common initialization for use by all constructors.
185 //            Initialize all fields, get the object into a consistent state.
186 //            This must be done even when the initial status shows an error,
187 //            so that the object is initialized sufficiently well for the destructor
188 //            to run safely.
189 //
init(UErrorCode & status)190 void RegexMatcher::init(UErrorCode &status) {
191     fPattern           = NULL;
192     fPatternOwned      = NULL;
193     fFrameSize         = 0;
194     fRegionStart       = 0;
195     fRegionLimit       = 0;
196     fAnchorStart       = 0;
197     fAnchorLimit       = 0;
198     fLookStart         = 0;
199     fLookLimit         = 0;
200     fActiveStart       = 0;
201     fActiveLimit       = 0;
202     fTransparentBounds = FALSE;
203     fAnchoringBounds   = TRUE;
204     fMatch             = FALSE;
205     fMatchStart        = 0;
206     fMatchEnd          = 0;
207     fLastMatchEnd      = -1;
208     fAppendPosition    = 0;
209     fHitEnd            = FALSE;
210     fRequireEnd        = FALSE;
211     fStack             = NULL;
212     fFrame             = NULL;
213     fTimeLimit         = 0;
214     fTime              = 0;
215     fTickCounter       = 0;
216     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
217     fCallbackFn        = NULL;
218     fCallbackContext   = NULL;
219     fFindProgressCallbackFn      = NULL;
220     fFindProgressCallbackContext = NULL;
221     fTraceDebug        = FALSE;
222     fDeferredStatus    = status;
223     fData              = fSmallData;
224     fWordBreakItr      = NULL;
225 
226     fStack             = NULL;
227     fInputText         = NULL;
228     fAltInputText      = NULL;
229     fInput             = NULL;
230     fInputLength       = 0;
231     fInputUniStrMaybeMutable = FALSE;
232 }
233 
234 //
235 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
236 //            This handles the common setup to be done after the Pattern is available.
237 //
init2(UText * input,UErrorCode & status)238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
239     if (U_FAILURE(status)) {
240         fDeferredStatus = status;
241         return;
242     }
243 
244     if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246         if (fData == NULL) {
247             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248             return;
249         }
250     }
251 
252     fStack = new UVector64(status);
253     if (fStack == NULL) {
254         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255         return;
256     }
257 
258     reset(input);
259     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260     if (U_FAILURE(status)) {
261         fDeferredStatus = status;
262         return;
263     }
264 }
265 
266 
267 static const UChar BACKSLASH  = 0x5c;
268 static const UChar DOLLARSIGN = 0x24;
269 static const UChar LEFTBRACKET = 0x7b;
270 static const UChar RIGHTBRACKET = 0x7d;
271 
272 //--------------------------------------------------------------------------------
273 //
274 //    appendReplacement
275 //
276 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278                                               const UnicodeString &replacement,
279                                               UErrorCode &status) {
280     UText replacementText = UTEXT_INITIALIZER;
281 
282     utext_openConstUnicodeString(&replacementText, &replacement, &status);
283     if (U_SUCCESS(status)) {
284         UText resultText = UTEXT_INITIALIZER;
285         utext_openUnicodeString(&resultText, &dest, &status);
286 
287         if (U_SUCCESS(status)) {
288             appendReplacement(&resultText, &replacementText, status);
289             utext_close(&resultText);
290         }
291         utext_close(&replacementText);
292     }
293 
294     return *this;
295 }
296 
297 //
298 //    appendReplacement, UText mode
299 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301                                               UText *replacement,
302                                               UErrorCode &status) {
303     if (U_FAILURE(status)) {
304         return *this;
305     }
306     if (U_FAILURE(fDeferredStatus)) {
307         status = fDeferredStatus;
308         return *this;
309     }
310     if (fMatch == FALSE) {
311         status = U_REGEX_INVALID_STATE;
312         return *this;
313     }
314 
315     // Copy input string from the end of previous match to start of current match
316     int64_t  destLen = utext_nativeLength(dest);
317     if (fMatchStart > fAppendPosition) {
318         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320                                      (int32_t)(fMatchStart-fAppendPosition), &status);
321         } else {
322             int32_t len16;
323             if (UTEXT_USES_U16(fInputText)) {
324                 len16 = (int32_t)(fMatchStart-fAppendPosition);
325             } else {
326                 UErrorCode lengthStatus = U_ZERO_ERROR;
327                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328             }
329             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330             if (inputChars == NULL) {
331                 status = U_MEMORY_ALLOCATION_ERROR;
332                 return *this;
333             }
334             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336             uprv_free(inputChars);
337         }
338     }
339     fAppendPosition = fMatchEnd;
340 
341 
342     // scan the replacement text, looking for substitutions ($n) and \escapes.
343     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
344     //         move entire ranges not containing substitutions.
345     UTEXT_SETNATIVEINDEX(replacement, 0);
346     for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
347         if (c == BACKSLASH) {
348             // Backslash Escape.  Copy the following char out without further checks.
349             //                    Note:  Surrogate pairs don't need any special handling
350             //                           The second half wont be a '$' or a '\', and
351             //                           will move to the dest normally on the next
352             //                           loop iteration.
353             c = UTEXT_CURRENT32(replacement);
354             if (c == U_SENTINEL) {
355                 break;
356             }
357 
358             if (c==0x55/*U*/ || c==0x75/*u*/) {
359                 // We have a \udddd or \Udddddddd escape sequence.
360                 int32_t offset = 0;
361                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363                 if (escapedChar != (UChar32)0xFFFFFFFF) {
364                     if (U_IS_BMP(escapedChar)) {
365                         UChar c16 = (UChar)escapedChar;
366                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367                     } else {
368                         UChar surrogate[2];
369                         surrogate[0] = U16_LEAD(escapedChar);
370                         surrogate[1] = U16_TRAIL(escapedChar);
371                         if (U_SUCCESS(status)) {
372                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373                         }
374                     }
375                     // TODO:  Report errors for mal-formed \u escapes?
376                     //        As this is, the original sequence is output, which may be OK.
377                     if (context.lastOffset == offset) {
378                         (void)UTEXT_PREVIOUS32(replacement);
379                     } else if (context.lastOffset != offset-1) {
380                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381                     }
382                 }
383             } else {
384                 (void)UTEXT_NEXT32(replacement);
385                 // Plain backslash escape.  Just put out the escaped character.
386                 if (U_IS_BMP(c)) {
387                     UChar c16 = (UChar)c;
388                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389                 } else {
390                     UChar surrogate[2];
391                     surrogate[0] = U16_LEAD(c);
392                     surrogate[1] = U16_TRAIL(c);
393                     if (U_SUCCESS(status)) {
394                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395                     }
396                 }
397             }
398         } else if (c != DOLLARSIGN) {
399             // Normal char, not a $.  Copy it out without further checks.
400             if (U_IS_BMP(c)) {
401                 UChar c16 = (UChar)c;
402                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403             } else {
404                 UChar surrogate[2];
405                 surrogate[0] = U16_LEAD(c);
406                 surrogate[1] = U16_TRAIL(c);
407                 if (U_SUCCESS(status)) {
408                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409                 }
410             }
411         } else {
412             // We've got a $.  Pick up a capture group name or number if one follows.
413             // Consume digits so long as the resulting group number <= the number of
414             // number of capture groups in the pattern.
415 
416             int32_t groupNum  = 0;
417             int32_t numDigits = 0;
418             UChar32 nextChar = utext_current32(replacement);
419             if (nextChar == LEFTBRACKET) {
420                 // Scan for a Named Capture Group, ${name}.
421                 UnicodeString groupName;
422                 utext_next32(replacement);
423                 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424                     nextChar = utext_next32(replacement);
425                     if (nextChar == U_SENTINEL) {
426                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427                     } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
428                                (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
429                                (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
430                         groupName.append(nextChar);
431                     } else if (nextChar == RIGHTBRACKET) {
432                         groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
433                         if (groupNum == 0) {
434                             status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435                         }
436                     } else {
437                         // Character was something other than a name char or a closing '}'
438                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439                     }
440                 }
441 
442             } else if (u_isdigit(nextChar)) {
443                 // $n    Scan for a capture group number
444                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445                 for (;;) {
446                     nextChar = UTEXT_CURRENT32(replacement);
447                     if (nextChar == U_SENTINEL) {
448                         break;
449                     }
450                     if (u_isdigit(nextChar) == FALSE) {
451                         break;
452                     }
453                     int32_t nextDigitVal = u_charDigitValue(nextChar);
454                     if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455                         // Don't consume the next digit if it makes the capture group number too big.
456                         if (numDigits == 0) {
457                             status = U_INDEX_OUTOFBOUNDS_ERROR;
458                         }
459                         break;
460                     }
461                     (void)UTEXT_NEXT32(replacement);
462                     groupNum=groupNum*10 + nextDigitVal;
463                     ++numDigits;
464                 }
465             } else {
466                 // $ not followed by capture group name or number.
467                 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468             }
469 
470             if (U_SUCCESS(status)) {
471                 destLen += appendGroup(groupNum, dest, status);
472             }
473         }  // End of $ capture group handling
474     }  // End of per-character loop through the replacement string.
475 
476     return *this;
477 }
478 
479 
480 
481 //--------------------------------------------------------------------------------
482 //
483 //    appendTail     Intended to be used in conjunction with appendReplacement()
484 //                   To the destination string, append everything following
485 //                   the last match position from the input string.
486 //
487 //                   Note:  Match ranges do not affect appendTail or appendReplacement
488 //
489 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491     UErrorCode status = U_ZERO_ERROR;
492     UText resultText = UTEXT_INITIALIZER;
493     utext_openUnicodeString(&resultText, &dest, &status);
494 
495     if (U_SUCCESS(status)) {
496         appendTail(&resultText, status);
497         utext_close(&resultText);
498     }
499 
500     return dest;
501 }
502 
503 //
504 //   appendTail, UText mode
505 //
appendTail(UText * dest,UErrorCode & status)506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507     if (U_FAILURE(status)) {
508         return dest;
509     }
510     if (U_FAILURE(fDeferredStatus)) {
511         status = fDeferredStatus;
512         return dest;
513     }
514 
515     if (fInputLength > fAppendPosition) {
516         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517             int64_t destLen = utext_nativeLength(dest);
518             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519                           (int32_t)(fInputLength-fAppendPosition), &status);
520         } else {
521             int32_t len16;
522             if (UTEXT_USES_U16(fInputText)) {
523                 len16 = (int32_t)(fInputLength-fAppendPosition);
524             } else {
525                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526                 status = U_ZERO_ERROR; // buffer overflow
527             }
528 
529             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530             if (inputChars == NULL) {
531                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532             } else {
533                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534                 int64_t destLen = utext_nativeLength(dest);
535                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536                 uprv_free(inputChars);
537             }
538         }
539     }
540     return dest;
541 }
542 
543 
544 
545 //--------------------------------------------------------------------------------
546 //
547 //   end
548 //
549 //--------------------------------------------------------------------------------
end(UErrorCode & err) const550 int32_t RegexMatcher::end(UErrorCode &err) const {
551     return end(0, err);
552 }
553 
end64(UErrorCode & err) const554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555     return end64(0, err);
556 }
557 
end64(int32_t group,UErrorCode & err) const558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559     if (U_FAILURE(err)) {
560         return -1;
561     }
562     if (fMatch == FALSE) {
563         err = U_REGEX_INVALID_STATE;
564         return -1;
565     }
566     if (group < 0 || group > fPattern->fGroupMap->size()) {
567         err = U_INDEX_OUTOFBOUNDS_ERROR;
568         return -1;
569     }
570     int64_t e = -1;
571     if (group == 0) {
572         e = fMatchEnd;
573     } else {
574         // Get the position within the stack frame of the variables for
575         //    this capture group.
576         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577         U_ASSERT(groupOffset < fPattern->fFrameSize);
578         U_ASSERT(groupOffset >= 0);
579         e = fFrame->fExtra[groupOffset + 1];
580     }
581 
582         return e;
583 }
584 
end(int32_t group,UErrorCode & err) const585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586     return (int32_t)end64(group, err);
587 }
588 
589 //--------------------------------------------------------------------------------
590 //
591 //   findProgressInterrupt  This function is called once for each advance in the target
592 //                          string from the find() function, and calls the user progress callback
593 //                          function if there is one installed.
594 //
595 //         Return:  TRUE if the find operation is to be terminated.
596 //                  FALSE if the find operation is to continue running.
597 //
598 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600     if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601         status = U_REGEX_STOPPED_BY_CALLER;
602         return TRUE;
603     }
604     return FALSE;
605 }
606 
607 //--------------------------------------------------------------------------------
608 //
609 //   find()
610 //
611 //--------------------------------------------------------------------------------
find()612 UBool RegexMatcher::find() {
613     if (U_FAILURE(fDeferredStatus)) {
614         return FALSE;
615     }
616     UErrorCode status = U_ZERO_ERROR;
617     UBool result = find(status);
618     return result;
619 }
620 
621 //--------------------------------------------------------------------------------
622 //
623 //   find()
624 //
625 //--------------------------------------------------------------------------------
find(UErrorCode & status)626 UBool RegexMatcher::find(UErrorCode &status) {
627     // Start at the position of the last match end.  (Will be zero if the
628     //   matcher has been reset.)
629     //
630     if (U_FAILURE(status)) {
631         return FALSE;
632     }
633     if (U_FAILURE(fDeferredStatus)) {
634         status = fDeferredStatus;
635         return FALSE;
636     }
637 
638     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639         return findUsingChunk(status);
640     }
641 
642     int64_t startPos = fMatchEnd;
643     if (startPos==0) {
644         startPos = fActiveStart;
645     }
646 
647     if (fMatch) {
648         // Save the position of any previous successful match.
649         fLastMatchEnd = fMatchEnd;
650 
651         if (fMatchStart == fMatchEnd) {
652             // Previous match had zero length.  Move start position up one position
653             //  to avoid sending find() into a loop on zero-length matches.
654             if (startPos >= fActiveLimit) {
655                 fMatch = FALSE;
656                 fHitEnd = TRUE;
657                 return FALSE;
658             }
659             UTEXT_SETNATIVEINDEX(fInputText, startPos);
660             (void)UTEXT_NEXT32(fInputText);
661             startPos = UTEXT_GETNATIVEINDEX(fInputText);
662         }
663     } else {
664         if (fLastMatchEnd >= 0) {
665             // A previous find() failed to match.  Don't try again.
666             //   (without this test, a pattern with a zero-length match
667             //    could match again at the end of an input string.)
668             fHitEnd = TRUE;
669             return FALSE;
670         }
671     }
672 
673 
674     // Compute the position in the input string beyond which a match can not begin, because
675     //   the minimum length match would extend past the end of the input.
676     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
677     //          Be aware of possible overflows if making changes here.
678     int64_t testStartLimit;
679     if (UTEXT_USES_U16(fInputText)) {
680         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681         if (startPos > testStartLimit) {
682             fMatch = FALSE;
683             fHitEnd = TRUE;
684             return FALSE;
685         }
686     } else {
687         // We don't know exactly how long the minimum match length is in native characters.
688         // Treat anything > 0 as 1.
689         testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690     }
691 
692     UChar32  c;
693     U_ASSERT(startPos >= 0);
694 
695     switch (fPattern->fStartType) {
696     case START_NO_INFO:
697         // No optimization was found.
698         //  Try a match at each input position.
699         for (;;) {
700             MatchAt(startPos, FALSE, status);
701             if (U_FAILURE(status)) {
702                 return FALSE;
703             }
704             if (fMatch) {
705                 return TRUE;
706             }
707             if (startPos >= testStartLimit) {
708                 fHitEnd = TRUE;
709                 return FALSE;
710             }
711             UTEXT_SETNATIVEINDEX(fInputText, startPos);
712             (void)UTEXT_NEXT32(fInputText);
713             startPos = UTEXT_GETNATIVEINDEX(fInputText);
714             // Note that it's perfectly OK for a pattern to have a zero-length
715             //   match at the end of a string, so we must make sure that the loop
716             //   runs with startPos == testStartLimit the last time through.
717             if  (findProgressInterrupt(startPos, status))
718                 return FALSE;
719         }
720         U_ASSERT(FALSE);
721 
722     case START_START:
723         // Matches are only possible at the start of the input string
724         //   (pattern begins with ^ or \A)
725         if (startPos > fActiveStart) {
726             fMatch = FALSE;
727             return FALSE;
728         }
729         MatchAt(startPos, FALSE, status);
730         if (U_FAILURE(status)) {
731             return FALSE;
732         }
733         return fMatch;
734 
735 
736     case START_SET:
737         {
738             // Match may start on any char from a pre-computed set.
739             U_ASSERT(fPattern->fMinMatchLen > 0);
740             UTEXT_SETNATIVEINDEX(fInputText, startPos);
741             for (;;) {
742                 int64_t pos = startPos;
743                 c = UTEXT_NEXT32(fInputText);
744                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745                 // c will be -1 (U_SENTINEL) at end of text, in which case we
746                 // skip this next block (so we don't have a negative array index)
747                 // and handle end of text in the following block.
748                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
750                     MatchAt(pos, FALSE, status);
751                     if (U_FAILURE(status)) {
752                         return FALSE;
753                     }
754                     if (fMatch) {
755                         return TRUE;
756                     }
757                     UTEXT_SETNATIVEINDEX(fInputText, pos);
758                 }
759                 if (startPos > testStartLimit) {
760                     fMatch = FALSE;
761                     fHitEnd = TRUE;
762                     return FALSE;
763                 }
764                 if  (findProgressInterrupt(startPos, status))
765                     return FALSE;
766             }
767         }
768         U_ASSERT(FALSE);
769 
770     case START_STRING:
771     case START_CHAR:
772         {
773             // Match starts on exactly one char.
774             U_ASSERT(fPattern->fMinMatchLen > 0);
775             UChar32 theChar = fPattern->fInitialChar;
776             UTEXT_SETNATIVEINDEX(fInputText, startPos);
777             for (;;) {
778                 int64_t pos = startPos;
779                 c = UTEXT_NEXT32(fInputText);
780                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
781                 if (c == theChar) {
782                     MatchAt(pos, FALSE, status);
783                     if (U_FAILURE(status)) {
784                         return FALSE;
785                     }
786                     if (fMatch) {
787                         return TRUE;
788                     }
789                     UTEXT_SETNATIVEINDEX(fInputText, startPos);
790                 }
791                 if (startPos > testStartLimit) {
792                     fMatch = FALSE;
793                     fHitEnd = TRUE;
794                     return FALSE;
795                 }
796                 if  (findProgressInterrupt(startPos, status))
797                     return FALSE;
798            }
799         }
800         U_ASSERT(FALSE);
801 
802     case START_LINE:
803         {
804             UChar32 ch;
805             if (startPos == fAnchorStart) {
806                 MatchAt(startPos, FALSE, status);
807                 if (U_FAILURE(status)) {
808                     return FALSE;
809                 }
810                 if (fMatch) {
811                     return TRUE;
812                 }
813                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814                 ch = UTEXT_NEXT32(fInputText);
815                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816             } else {
817                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818                 ch = UTEXT_PREVIOUS32(fInputText);
819                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820             }
821 
822             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823                 for (;;) {
824                     if (ch == 0x0a) {
825                             MatchAt(startPos, FALSE, status);
826                             if (U_FAILURE(status)) {
827                                 return FALSE;
828                             }
829                             if (fMatch) {
830                                 return TRUE;
831                             }
832                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
833                     }
834                     if (startPos >= testStartLimit) {
835                         fMatch = FALSE;
836                         fHitEnd = TRUE;
837                         return FALSE;
838                     }
839                     ch = UTEXT_NEXT32(fInputText);
840                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
841                     // Note that it's perfectly OK for a pattern to have a zero-length
842                     //   match at the end of a string, so we must make sure that the loop
843                     //   runs with startPos == testStartLimit the last time through.
844                     if  (findProgressInterrupt(startPos, status))
845                         return FALSE;
846                 }
847             } else {
848                 for (;;) {
849                     if (isLineTerminator(ch)) {
850                         if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851                             (void)UTEXT_NEXT32(fInputText);
852                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
853                         }
854                         MatchAt(startPos, FALSE, status);
855                         if (U_FAILURE(status)) {
856                             return FALSE;
857                         }
858                         if (fMatch) {
859                             return TRUE;
860                         }
861                         UTEXT_SETNATIVEINDEX(fInputText, startPos);
862                     }
863                     if (startPos >= testStartLimit) {
864                         fMatch = FALSE;
865                         fHitEnd = TRUE;
866                         return FALSE;
867                     }
868                     ch = UTEXT_NEXT32(fInputText);
869                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
870                     // Note that it's perfectly OK for a pattern to have a zero-length
871                     //   match at the end of a string, so we must make sure that the loop
872                     //   runs with startPos == testStartLimit the last time through.
873                     if  (findProgressInterrupt(startPos, status))
874                         return FALSE;
875                 }
876             }
877         }
878 
879     default:
880         U_ASSERT(FALSE);
881     }
882 
883     U_ASSERT(FALSE);
884     return FALSE;
885 }
886 
887 
888 
find(int64_t start,UErrorCode & status)889 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
890     if (U_FAILURE(status)) {
891         return FALSE;
892     }
893     if (U_FAILURE(fDeferredStatus)) {
894         status = fDeferredStatus;
895         return FALSE;
896     }
897     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
898                                           //        This will reset the region to be the full input length.
899     if (start < 0) {
900         status = U_INDEX_OUTOFBOUNDS_ERROR;
901         return FALSE;
902     }
903 
904     int64_t nativeStart = start;
905     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
906         status = U_INDEX_OUTOFBOUNDS_ERROR;
907         return FALSE;
908     }
909     fMatchEnd = nativeStart;
910     return find(status);
911 }
912 
913 
914 //--------------------------------------------------------------------------------
915 //
916 //   findUsingChunk() -- like find(), but with the advance knowledge that the
917 //                       entire string is available in the UText's chunk buffer.
918 //
919 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)920 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
921     // Start at the position of the last match end.  (Will be zero if the
922     //   matcher has been reset.
923     //
924 
925     int32_t startPos = (int32_t)fMatchEnd;
926     if (startPos==0) {
927         startPos = (int32_t)fActiveStart;
928     }
929 
930     const UChar *inputBuf = fInputText->chunkContents;
931 
932     if (fMatch) {
933         // Save the position of any previous successful match.
934         fLastMatchEnd = fMatchEnd;
935 
936         if (fMatchStart == fMatchEnd) {
937             // Previous match had zero length.  Move start position up one position
938             //  to avoid sending find() into a loop on zero-length matches.
939             if (startPos >= fActiveLimit) {
940                 fMatch = FALSE;
941                 fHitEnd = TRUE;
942                 return FALSE;
943             }
944             U16_FWD_1(inputBuf, startPos, fInputLength);
945         }
946     } else {
947         if (fLastMatchEnd >= 0) {
948             // A previous find() failed to match.  Don't try again.
949             //   (without this test, a pattern with a zero-length match
950             //    could match again at the end of an input string.)
951             fHitEnd = TRUE;
952             return FALSE;
953         }
954     }
955 
956 
957     // Compute the position in the input string beyond which a match can not begin, because
958     //   the minimum length match would extend past the end of the input.
959     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
960     //          Be aware of possible overflows if making changes here.
961     //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
962     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
963     if (startPos > testLen) {
964         fMatch = FALSE;
965         fHitEnd = TRUE;
966         return FALSE;
967     }
968 
969     UChar32  c;
970     U_ASSERT(startPos >= 0);
971 
972     switch (fPattern->fStartType) {
973     case START_NO_INFO:
974         // No optimization was found.
975         //  Try a match at each input position.
976         for (;;) {
977             MatchChunkAt(startPos, FALSE, status);
978             if (U_FAILURE(status)) {
979                 return FALSE;
980             }
981             if (fMatch) {
982                 return TRUE;
983             }
984             if (startPos >= testLen) {
985                 fHitEnd = TRUE;
986                 return FALSE;
987             }
988             U16_FWD_1(inputBuf, startPos, fActiveLimit);
989             // Note that it's perfectly OK for a pattern to have a zero-length
990             //   match at the end of a string, so we must make sure that the loop
991             //   runs with startPos == testLen the last time through.
992             if  (findProgressInterrupt(startPos, status))
993                 return FALSE;
994         }
995         U_ASSERT(FALSE);
996 
997     case START_START:
998         // Matches are only possible at the start of the input string
999         //   (pattern begins with ^ or \A)
1000         if (startPos > fActiveStart) {
1001             fMatch = FALSE;
1002             return FALSE;
1003         }
1004         MatchChunkAt(startPos, FALSE, status);
1005         if (U_FAILURE(status)) {
1006             return FALSE;
1007         }
1008         return fMatch;
1009 
1010 
1011     case START_SET:
1012     {
1013         // Match may start on any char from a pre-computed set.
1014         U_ASSERT(fPattern->fMinMatchLen > 0);
1015         for (;;) {
1016             int32_t pos = startPos;
1017             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1018             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1019                 (c>=256 && fPattern->fInitialChars->contains(c))) {
1020                 MatchChunkAt(pos, FALSE, status);
1021                 if (U_FAILURE(status)) {
1022                     return FALSE;
1023                 }
1024                 if (fMatch) {
1025                     return TRUE;
1026                 }
1027             }
1028             if (startPos > testLen) {
1029                 fMatch = FALSE;
1030                 fHitEnd = TRUE;
1031                 return FALSE;
1032             }
1033             if  (findProgressInterrupt(startPos, status))
1034                 return FALSE;
1035         }
1036     }
1037     U_ASSERT(FALSE);
1038 
1039     case START_STRING:
1040     case START_CHAR:
1041     {
1042         // Match starts on exactly one char.
1043         U_ASSERT(fPattern->fMinMatchLen > 0);
1044         UChar32 theChar = fPattern->fInitialChar;
1045         for (;;) {
1046             int32_t pos = startPos;
1047             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1048             if (c == theChar) {
1049                 MatchChunkAt(pos, FALSE, status);
1050                 if (U_FAILURE(status)) {
1051                     return FALSE;
1052                 }
1053                 if (fMatch) {
1054                     return TRUE;
1055                 }
1056             }
1057             if (startPos > testLen) {
1058                 fMatch = FALSE;
1059                 fHitEnd = TRUE;
1060                 return FALSE;
1061             }
1062             if  (findProgressInterrupt(startPos, status))
1063                 return FALSE;
1064         }
1065     }
1066     U_ASSERT(FALSE);
1067 
1068     case START_LINE:
1069     {
1070         UChar32 ch;
1071         if (startPos == fAnchorStart) {
1072             MatchChunkAt(startPos, FALSE, status);
1073             if (U_FAILURE(status)) {
1074                 return FALSE;
1075             }
1076             if (fMatch) {
1077                 return TRUE;
1078             }
1079             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1080         }
1081 
1082         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1083             for (;;) {
1084                 ch = inputBuf[startPos-1];
1085                 if (ch == 0x0a) {
1086                     MatchChunkAt(startPos, FALSE, status);
1087                     if (U_FAILURE(status)) {
1088                         return FALSE;
1089                     }
1090                     if (fMatch) {
1091                         return TRUE;
1092                     }
1093                 }
1094                 if (startPos >= testLen) {
1095                     fMatch = FALSE;
1096                     fHitEnd = TRUE;
1097                     return FALSE;
1098                 }
1099                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1100                 // Note that it's perfectly OK for a pattern to have a zero-length
1101                 //   match at the end of a string, so we must make sure that the loop
1102                 //   runs with startPos == testLen the last time through.
1103                 if  (findProgressInterrupt(startPos, status))
1104                     return FALSE;
1105             }
1106         } else {
1107             for (;;) {
1108                 ch = inputBuf[startPos-1];
1109                 if (isLineTerminator(ch)) {
1110                     if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1111                         startPos++;
1112                     }
1113                     MatchChunkAt(startPos, FALSE, status);
1114                     if (U_FAILURE(status)) {
1115                         return FALSE;
1116                     }
1117                     if (fMatch) {
1118                         return TRUE;
1119                     }
1120                 }
1121                 if (startPos >= testLen) {
1122                     fMatch = FALSE;
1123                     fHitEnd = TRUE;
1124                     return FALSE;
1125                 }
1126                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1127                 // Note that it's perfectly OK for a pattern to have a zero-length
1128                 //   match at the end of a string, so we must make sure that the loop
1129                 //   runs with startPos == testLen the last time through.
1130                 if  (findProgressInterrupt(startPos, status))
1131                     return FALSE;
1132             }
1133         }
1134     }
1135 
1136     default:
1137         U_ASSERT(FALSE);
1138     }
1139 
1140     U_ASSERT(FALSE);
1141     return FALSE;
1142 }
1143 
1144 
1145 
1146 //--------------------------------------------------------------------------------
1147 //
1148 //  group()
1149 //
1150 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1151 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1152     return group(0, status);
1153 }
1154 
1155 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1156 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1157     return group(0, dest, group_len, status);
1158 }
1159 
1160 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1161 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1162     group_len = 0;
1163     if (U_FAILURE(status)) {
1164         return dest;
1165     }
1166     if (U_FAILURE(fDeferredStatus)) {
1167         status = fDeferredStatus;
1168     } else if (fMatch == FALSE) {
1169         status = U_REGEX_INVALID_STATE;
1170     } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1171         status = U_INDEX_OUTOFBOUNDS_ERROR;
1172     }
1173 
1174     if (U_FAILURE(status)) {
1175         return dest;
1176     }
1177 
1178     int64_t s, e;
1179     if (groupNum == 0) {
1180         s = fMatchStart;
1181         e = fMatchEnd;
1182     } else {
1183         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1184         U_ASSERT(groupOffset < fPattern->fFrameSize);
1185         U_ASSERT(groupOffset >= 0);
1186         s = fFrame->fExtra[groupOffset];
1187         e = fFrame->fExtra[groupOffset+1];
1188     }
1189 
1190     if (s < 0) {
1191         // A capture group wasn't part of the match
1192         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1193     }
1194     U_ASSERT(s <= e);
1195     group_len = e - s;
1196 
1197     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1198     if (dest)
1199         UTEXT_SETNATIVEINDEX(dest, s);
1200     return dest;
1201 }
1202 
group(int32_t groupNum,UErrorCode & status) const1203 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1204     UnicodeString result;
1205     int64_t groupStart = start64(groupNum, status);
1206     int64_t groupEnd = end64(groupNum, status);
1207     if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1208         return result;
1209     }
1210 
1211     // Get the group length using a utext_extract preflight.
1212     //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
1213     int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1214     if (status != U_BUFFER_OVERFLOW_ERROR) {
1215         return result;
1216     }
1217 
1218     status = U_ZERO_ERROR;
1219     UChar *buf = result.getBuffer(length);
1220     if (buf == NULL) {
1221         status = U_MEMORY_ALLOCATION_ERROR;
1222     } else {
1223         int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1224         result.releaseBuffer(extractLength);
1225         U_ASSERT(length == extractLength);
1226     }
1227     return result;
1228 }
1229 
1230 
1231 //--------------------------------------------------------------------------------
1232 //
1233 //  appendGroup() -- currently internal only, appends a group to a UText rather
1234 //                   than replacing its contents
1235 //
1236 //--------------------------------------------------------------------------------
1237 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1238 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1239     if (U_FAILURE(status)) {
1240         return 0;
1241     }
1242     if (U_FAILURE(fDeferredStatus)) {
1243         status = fDeferredStatus;
1244         return 0;
1245     }
1246     int64_t destLen = utext_nativeLength(dest);
1247 
1248     if (fMatch == FALSE) {
1249         status = U_REGEX_INVALID_STATE;
1250         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1251     }
1252     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1253         status = U_INDEX_OUTOFBOUNDS_ERROR;
1254         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1255     }
1256 
1257     int64_t s, e;
1258     if (groupNum == 0) {
1259         s = fMatchStart;
1260         e = fMatchEnd;
1261     } else {
1262         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1263         U_ASSERT(groupOffset < fPattern->fFrameSize);
1264         U_ASSERT(groupOffset >= 0);
1265         s = fFrame->fExtra[groupOffset];
1266         e = fFrame->fExtra[groupOffset+1];
1267     }
1268 
1269     if (s < 0) {
1270         // A capture group wasn't part of the match
1271         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1272     }
1273     U_ASSERT(s <= e);
1274 
1275     int64_t deltaLen;
1276     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1277         U_ASSERT(e <= fInputLength);
1278         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1279     } else {
1280         int32_t len16;
1281         if (UTEXT_USES_U16(fInputText)) {
1282             len16 = (int32_t)(e-s);
1283         } else {
1284             UErrorCode lengthStatus = U_ZERO_ERROR;
1285             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1286         }
1287         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1288         if (groupChars == NULL) {
1289             status = U_MEMORY_ALLOCATION_ERROR;
1290             return 0;
1291         }
1292         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1293 
1294         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1295         uprv_free(groupChars);
1296     }
1297     return deltaLen;
1298 }
1299 
1300 
1301 
1302 //--------------------------------------------------------------------------------
1303 //
1304 //  groupCount()
1305 //
1306 //--------------------------------------------------------------------------------
groupCount() const1307 int32_t RegexMatcher::groupCount() const {
1308     return fPattern->fGroupMap->size();
1309 }
1310 
1311 //--------------------------------------------------------------------------------
1312 //
1313 //  hasAnchoringBounds()
1314 //
1315 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1316 UBool RegexMatcher::hasAnchoringBounds() const {
1317     return fAnchoringBounds;
1318 }
1319 
1320 
1321 //--------------------------------------------------------------------------------
1322 //
1323 //  hasTransparentBounds()
1324 //
1325 //--------------------------------------------------------------------------------
hasTransparentBounds() const1326 UBool RegexMatcher::hasTransparentBounds() const {
1327     return fTransparentBounds;
1328 }
1329 
1330 
1331 
1332 //--------------------------------------------------------------------------------
1333 //
1334 //  hitEnd()
1335 //
1336 //--------------------------------------------------------------------------------
hitEnd() const1337 UBool RegexMatcher::hitEnd() const {
1338     return fHitEnd;
1339 }
1340 
1341 
1342 //--------------------------------------------------------------------------------
1343 //
1344 //  input()
1345 //
1346 //--------------------------------------------------------------------------------
input() const1347 const UnicodeString &RegexMatcher::input() const {
1348     if (!fInput) {
1349         UErrorCode status = U_ZERO_ERROR;
1350         int32_t len16;
1351         if (UTEXT_USES_U16(fInputText)) {
1352             len16 = (int32_t)fInputLength;
1353         } else {
1354             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1355             status = U_ZERO_ERROR; // overflow, length status
1356         }
1357         UnicodeString *result = new UnicodeString(len16, 0, 0);
1358 
1359         UChar *inputChars = result->getBuffer(len16);
1360         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1361         result->releaseBuffer(len16);
1362 
1363         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1364     }
1365 
1366     return *fInput;
1367 }
1368 
1369 //--------------------------------------------------------------------------------
1370 //
1371 //  inputText()
1372 //
1373 //--------------------------------------------------------------------------------
inputText() const1374 UText *RegexMatcher::inputText() const {
1375     return fInputText;
1376 }
1377 
1378 
1379 //--------------------------------------------------------------------------------
1380 //
1381 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1382 //
1383 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1384 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1385     if (U_FAILURE(status)) {
1386         return dest;
1387     }
1388     if (U_FAILURE(fDeferredStatus)) {
1389         status = fDeferredStatus;
1390         return dest;
1391     }
1392 
1393     if (dest) {
1394         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1395             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1396         } else {
1397             int32_t input16Len;
1398             if (UTEXT_USES_U16(fInputText)) {
1399                 input16Len = (int32_t)fInputLength;
1400             } else {
1401                 UErrorCode lengthStatus = U_ZERO_ERROR;
1402                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1403             }
1404             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1405             if (inputChars == NULL) {
1406                 return dest;
1407             }
1408 
1409             status = U_ZERO_ERROR;
1410             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1411             status = U_ZERO_ERROR;
1412             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1413 
1414             uprv_free(inputChars);
1415         }
1416         return dest;
1417     } else {
1418         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1419     }
1420 }
1421 
1422 
1423 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1424 static UBool compat_SyncMutableUTextContents(UText *ut) {
1425     UBool retVal = FALSE;
1426 
1427     //  In the following test, we're really only interested in whether the UText should switch
1428     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1429     //  will still point to the correct data.
1430     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1431         UnicodeString *us=(UnicodeString *)ut->context;
1432 
1433         // Update to the latest length.
1434         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1435         int32_t newLength = us->length();
1436 
1437         // Update the chunk description.
1438         // The buffer may have switched between stack- and heap-based.
1439         ut->chunkContents    = us->getBuffer();
1440         ut->chunkLength      = newLength;
1441         ut->chunkNativeLimit = newLength;
1442         ut->nativeIndexingLimit = newLength;
1443         retVal = TRUE;
1444     }
1445 
1446     return retVal;
1447 }
1448 
1449 //--------------------------------------------------------------------------------
1450 //
1451 //  lookingAt()
1452 //
1453 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1454 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1455     if (U_FAILURE(status)) {
1456         return FALSE;
1457     }
1458     if (U_FAILURE(fDeferredStatus)) {
1459         status = fDeferredStatus;
1460         return FALSE;
1461     }
1462 
1463     if (fInputUniStrMaybeMutable) {
1464         if (compat_SyncMutableUTextContents(fInputText)) {
1465         fInputLength = utext_nativeLength(fInputText);
1466         reset();
1467         }
1468     }
1469     else {
1470         resetPreserveRegion();
1471     }
1472     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1473         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1474     } else {
1475         MatchAt(fActiveStart, FALSE, status);
1476     }
1477     return fMatch;
1478 }
1479 
1480 
lookingAt(int64_t start,UErrorCode & status)1481 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1482     if (U_FAILURE(status)) {
1483         return FALSE;
1484     }
1485     if (U_FAILURE(fDeferredStatus)) {
1486         status = fDeferredStatus;
1487         return FALSE;
1488     }
1489     reset();
1490 
1491     if (start < 0) {
1492         status = U_INDEX_OUTOFBOUNDS_ERROR;
1493         return FALSE;
1494     }
1495 
1496     if (fInputUniStrMaybeMutable) {
1497         if (compat_SyncMutableUTextContents(fInputText)) {
1498         fInputLength = utext_nativeLength(fInputText);
1499         reset();
1500         }
1501     }
1502 
1503     int64_t nativeStart;
1504     nativeStart = start;
1505     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1506         status = U_INDEX_OUTOFBOUNDS_ERROR;
1507         return FALSE;
1508     }
1509 
1510     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1511         MatchChunkAt((int32_t)nativeStart, FALSE, status);
1512     } else {
1513         MatchAt(nativeStart, FALSE, status);
1514     }
1515     return fMatch;
1516 }
1517 
1518 
1519 
1520 //--------------------------------------------------------------------------------
1521 //
1522 //  matches()
1523 //
1524 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1525 UBool RegexMatcher::matches(UErrorCode &status) {
1526     if (U_FAILURE(status)) {
1527         return FALSE;
1528     }
1529     if (U_FAILURE(fDeferredStatus)) {
1530         status = fDeferredStatus;
1531         return FALSE;
1532     }
1533 
1534     if (fInputUniStrMaybeMutable) {
1535         if (compat_SyncMutableUTextContents(fInputText)) {
1536         fInputLength = utext_nativeLength(fInputText);
1537         reset();
1538         }
1539     }
1540     else {
1541         resetPreserveRegion();
1542     }
1543 
1544     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1545         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1546     } else {
1547         MatchAt(fActiveStart, TRUE, status);
1548     }
1549     return fMatch;
1550 }
1551 
1552 
matches(int64_t start,UErrorCode & status)1553 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1554     if (U_FAILURE(status)) {
1555         return FALSE;
1556     }
1557     if (U_FAILURE(fDeferredStatus)) {
1558         status = fDeferredStatus;
1559         return FALSE;
1560     }
1561     reset();
1562 
1563     if (start < 0) {
1564         status = U_INDEX_OUTOFBOUNDS_ERROR;
1565         return FALSE;
1566     }
1567 
1568     if (fInputUniStrMaybeMutable) {
1569         if (compat_SyncMutableUTextContents(fInputText)) {
1570         fInputLength = utext_nativeLength(fInputText);
1571         reset();
1572         }
1573     }
1574 
1575     int64_t nativeStart;
1576     nativeStart = start;
1577     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1578         status = U_INDEX_OUTOFBOUNDS_ERROR;
1579         return FALSE;
1580     }
1581 
1582     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1583         MatchChunkAt((int32_t)nativeStart, TRUE, status);
1584     } else {
1585         MatchAt(nativeStart, TRUE, status);
1586     }
1587     return fMatch;
1588 }
1589 
1590 
1591 
1592 //--------------------------------------------------------------------------------
1593 //
1594 //    pattern
1595 //
1596 //--------------------------------------------------------------------------------
pattern() const1597 const RegexPattern &RegexMatcher::pattern() const {
1598     return *fPattern;
1599 }
1600 
1601 
1602 
1603 //--------------------------------------------------------------------------------
1604 //
1605 //    region
1606 //
1607 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1608 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1609     if (U_FAILURE(status)) {
1610         return *this;
1611     }
1612 
1613     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1614         status = U_ILLEGAL_ARGUMENT_ERROR;
1615     }
1616 
1617     int64_t nativeStart = regionStart;
1618     int64_t nativeLimit = regionLimit;
1619     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1620       status = U_ILLEGAL_ARGUMENT_ERROR;
1621     }
1622 
1623     if (startIndex == -1)
1624       this->reset();
1625     else
1626       resetPreserveRegion();
1627 
1628     fRegionStart = nativeStart;
1629     fRegionLimit = nativeLimit;
1630     fActiveStart = nativeStart;
1631     fActiveLimit = nativeLimit;
1632 
1633     if (startIndex != -1) {
1634       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1635           status = U_INDEX_OUTOFBOUNDS_ERROR;
1636       }
1637       fMatchEnd = startIndex;
1638     }
1639 
1640     if (!fTransparentBounds) {
1641         fLookStart = nativeStart;
1642         fLookLimit = nativeLimit;
1643     }
1644     if (fAnchoringBounds) {
1645         fAnchorStart = nativeStart;
1646         fAnchorLimit = nativeLimit;
1647     }
1648     return *this;
1649 }
1650 
region(int64_t start,int64_t limit,UErrorCode & status)1651 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1652   return region(start, limit, -1, status);
1653 }
1654 
1655 //--------------------------------------------------------------------------------
1656 //
1657 //    regionEnd
1658 //
1659 //--------------------------------------------------------------------------------
regionEnd() const1660 int32_t RegexMatcher::regionEnd() const {
1661     return (int32_t)fRegionLimit;
1662 }
1663 
regionEnd64() const1664 int64_t RegexMatcher::regionEnd64() const {
1665     return fRegionLimit;
1666 }
1667 
1668 //--------------------------------------------------------------------------------
1669 //
1670 //    regionStart
1671 //
1672 //--------------------------------------------------------------------------------
regionStart() const1673 int32_t RegexMatcher::regionStart() const {
1674     return (int32_t)fRegionStart;
1675 }
1676 
regionStart64() const1677 int64_t RegexMatcher::regionStart64() const {
1678     return fRegionStart;
1679 }
1680 
1681 
1682 //--------------------------------------------------------------------------------
1683 //
1684 //    replaceAll
1685 //
1686 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1687 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1688     UText replacementText = UTEXT_INITIALIZER;
1689     UText resultText = UTEXT_INITIALIZER;
1690     UnicodeString resultString;
1691     if (U_FAILURE(status)) {
1692         return resultString;
1693     }
1694 
1695     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1696     utext_openUnicodeString(&resultText, &resultString, &status);
1697 
1698     replaceAll(&replacementText, &resultText, status);
1699 
1700     utext_close(&resultText);
1701     utext_close(&replacementText);
1702 
1703     return resultString;
1704 }
1705 
1706 
1707 //
1708 //    replaceAll, UText mode
1709 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1710 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1711     if (U_FAILURE(status)) {
1712         return dest;
1713     }
1714     if (U_FAILURE(fDeferredStatus)) {
1715         status = fDeferredStatus;
1716         return dest;
1717     }
1718 
1719     if (dest == NULL) {
1720         UnicodeString emptyString;
1721         UText empty = UTEXT_INITIALIZER;
1722 
1723         utext_openUnicodeString(&empty, &emptyString, &status);
1724         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1725         utext_close(&empty);
1726     }
1727 
1728     if (U_SUCCESS(status)) {
1729         reset();
1730         while (find()) {
1731             appendReplacement(dest, replacement, status);
1732             if (U_FAILURE(status)) {
1733                 break;
1734             }
1735         }
1736         appendTail(dest, status);
1737     }
1738 
1739     return dest;
1740 }
1741 
1742 
1743 //--------------------------------------------------------------------------------
1744 //
1745 //    replaceFirst
1746 //
1747 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1748 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1749     UText replacementText = UTEXT_INITIALIZER;
1750     UText resultText = UTEXT_INITIALIZER;
1751     UnicodeString resultString;
1752 
1753     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754     utext_openUnicodeString(&resultText, &resultString, &status);
1755 
1756     replaceFirst(&replacementText, &resultText, status);
1757 
1758     utext_close(&resultText);
1759     utext_close(&replacementText);
1760 
1761     return resultString;
1762 }
1763 
1764 //
1765 //    replaceFirst, UText mode
1766 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1767 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1768     if (U_FAILURE(status)) {
1769         return dest;
1770     }
1771     if (U_FAILURE(fDeferredStatus)) {
1772         status = fDeferredStatus;
1773         return dest;
1774     }
1775 
1776     reset();
1777     if (!find()) {
1778         return getInput(dest, status);
1779     }
1780 
1781     if (dest == NULL) {
1782         UnicodeString emptyString;
1783         UText empty = UTEXT_INITIALIZER;
1784 
1785         utext_openUnicodeString(&empty, &emptyString, &status);
1786         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1787         utext_close(&empty);
1788     }
1789 
1790     appendReplacement(dest, replacement, status);
1791     appendTail(dest, status);
1792 
1793     return dest;
1794 }
1795 
1796 
1797 //--------------------------------------------------------------------------------
1798 //
1799 //     requireEnd
1800 //
1801 //--------------------------------------------------------------------------------
requireEnd() const1802 UBool RegexMatcher::requireEnd() const {
1803     return fRequireEnd;
1804 }
1805 
1806 
1807 //--------------------------------------------------------------------------------
1808 //
1809 //     reset
1810 //
1811 //--------------------------------------------------------------------------------
reset()1812 RegexMatcher &RegexMatcher::reset() {
1813     fRegionStart    = 0;
1814     fRegionLimit    = fInputLength;
1815     fActiveStart    = 0;
1816     fActiveLimit    = fInputLength;
1817     fAnchorStart    = 0;
1818     fAnchorLimit    = fInputLength;
1819     fLookStart      = 0;
1820     fLookLimit      = fInputLength;
1821     resetPreserveRegion();
1822     return *this;
1823 }
1824 
1825 
1826 
resetPreserveRegion()1827 void RegexMatcher::resetPreserveRegion() {
1828     fMatchStart     = 0;
1829     fMatchEnd       = 0;
1830     fLastMatchEnd   = -1;
1831     fAppendPosition = 0;
1832     fMatch          = FALSE;
1833     fHitEnd         = FALSE;
1834     fRequireEnd     = FALSE;
1835     fTime           = 0;
1836     fTickCounter    = TIMER_INITIAL_VALUE;
1837     //resetStack(); // more expensive than it looks...
1838 }
1839 
1840 
reset(const UnicodeString & input)1841 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1842     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1843     if (fPattern->fNeedsAltInput) {
1844         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1845     }
1846     if (U_FAILURE(fDeferredStatus)) {
1847         return *this;
1848     }
1849     fInputLength = utext_nativeLength(fInputText);
1850 
1851     reset();
1852     delete fInput;
1853     fInput = NULL;
1854 
1855     //  Do the following for any UnicodeString.
1856     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1857     fInputUniStrMaybeMutable = TRUE;
1858 
1859     if (fWordBreakItr != NULL) {
1860 #if UCONFIG_NO_BREAK_ITERATION==0
1861         UErrorCode status = U_ZERO_ERROR;
1862         fWordBreakItr->setText(fInputText, status);
1863 #endif
1864     }
1865     return *this;
1866 }
1867 
1868 
reset(UText * input)1869 RegexMatcher &RegexMatcher::reset(UText *input) {
1870     if (fInputText != input) {
1871         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1872         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1873         if (U_FAILURE(fDeferredStatus)) {
1874             return *this;
1875         }
1876         fInputLength = utext_nativeLength(fInputText);
1877 
1878         delete fInput;
1879         fInput = NULL;
1880 
1881         if (fWordBreakItr != NULL) {
1882 #if UCONFIG_NO_BREAK_ITERATION==0
1883             UErrorCode status = U_ZERO_ERROR;
1884             fWordBreakItr->setText(input, status);
1885 #endif
1886         }
1887     }
1888     reset();
1889     fInputUniStrMaybeMutable = FALSE;
1890 
1891     return *this;
1892 }
1893 
1894 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1895     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1896     return *this;
1897 }*/
1898 
reset(int64_t position,UErrorCode & status)1899 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1900     if (U_FAILURE(status)) {
1901         return *this;
1902     }
1903     reset();       // Reset also resets the region to be the entire string.
1904 
1905     if (position < 0 || position > fActiveLimit) {
1906         status = U_INDEX_OUTOFBOUNDS_ERROR;
1907         return *this;
1908     }
1909     fMatchEnd = position;
1910     return *this;
1911 }
1912 
1913 
1914 //--------------------------------------------------------------------------------
1915 //
1916 //    refresh
1917 //
1918 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1919 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1920     if (U_FAILURE(status)) {
1921         return *this;
1922     }
1923     if (input == NULL) {
1924         status = U_ILLEGAL_ARGUMENT_ERROR;
1925         return *this;
1926     }
1927     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1928         status = U_ILLEGAL_ARGUMENT_ERROR;
1929         return *this;
1930     }
1931     int64_t  pos = utext_getNativeIndex(fInputText);
1932     //  Shallow read-only clone of the new UText into the existing input UText
1933     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1934     if (U_FAILURE(status)) {
1935         return *this;
1936     }
1937     utext_setNativeIndex(fInputText, pos);
1938 
1939     if (fAltInputText != NULL) {
1940         pos = utext_getNativeIndex(fAltInputText);
1941         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1942         if (U_FAILURE(status)) {
1943             return *this;
1944         }
1945         utext_setNativeIndex(fAltInputText, pos);
1946     }
1947     return *this;
1948 }
1949 
1950 
1951 
1952 //--------------------------------------------------------------------------------
1953 //
1954 //    setTrace
1955 //
1956 //--------------------------------------------------------------------------------
setTrace(UBool state)1957 void RegexMatcher::setTrace(UBool state) {
1958     fTraceDebug = state;
1959 }
1960 
1961 
1962 
1963 /**
1964   *  UText, replace entire contents of the destination UText with a substring of the source UText.
1965   *
1966   *     @param src    The source UText
1967   *     @param dest   The destination UText. Must be writable.
1968   *                   May be NULL, in which case a new UText will be allocated.
1969   *     @param start  Start index of source substring.
1970   *     @param limit  Limit index of source substring.
1971   *     @param status An error code.
1972   */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1973 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1974     if (U_FAILURE(*status)) {
1975         return dest;
1976     }
1977     if (start == limit) {
1978         if (dest) {
1979             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1980             return dest;
1981         } else {
1982             return utext_openUChars(NULL, NULL, 0, status);
1983         }
1984     }
1985     int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1986     if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1987         return dest;
1988     }
1989     *status = U_ZERO_ERROR;
1990     MaybeStackArray<UChar, 40> buffer;
1991     if (length >= buffer.getCapacity()) {
1992         UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
1993         if (newBuf == NULL) {
1994             *status = U_MEMORY_ALLOCATION_ERROR;
1995         }
1996     }
1997     utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1998     if (dest) {
1999         utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2000         return dest;
2001     }
2002 
2003     // Caller did not provide a prexisting UText.
2004     // Open a new one, and have it adopt the text buffer storage.
2005     if (U_FAILURE(*status)) {
2006         return NULL;
2007     }
2008     int32_t ownedLength = 0;
2009     UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2010     if (ownedBuf == NULL) {
2011         *status = U_MEMORY_ALLOCATION_ERROR;
2012         return NULL;
2013     }
2014     UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2015     if (U_FAILURE(*status)) {
2016         uprv_free(ownedBuf);
2017         return NULL;
2018     }
2019     result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2020     return result;
2021 }
2022 
2023 
2024 //---------------------------------------------------------------------
2025 //
2026 //   split
2027 //
2028 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2029 int32_t  RegexMatcher::split(const UnicodeString &input,
2030         UnicodeString    dest[],
2031         int32_t          destCapacity,
2032         UErrorCode      &status)
2033 {
2034     UText inputText = UTEXT_INITIALIZER;
2035     utext_openConstUnicodeString(&inputText, &input, &status);
2036     if (U_FAILURE(status)) {
2037         return 0;
2038     }
2039 
2040     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2041     if (destText == NULL) {
2042         status = U_MEMORY_ALLOCATION_ERROR;
2043         return 0;
2044     }
2045     int32_t i;
2046     for (i = 0; i < destCapacity; i++) {
2047         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2048     }
2049 
2050     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2051 
2052     for (i = 0; i < destCapacity; i++) {
2053         utext_close(destText[i]);
2054     }
2055 
2056     uprv_free(destText);
2057     utext_close(&inputText);
2058     return fieldCount;
2059 }
2060 
2061 //
2062 //   split, UText mode
2063 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2064 int32_t  RegexMatcher::split(UText *input,
2065         UText           *dest[],
2066         int32_t          destCapacity,
2067         UErrorCode      &status)
2068 {
2069     //
2070     // Check arguements for validity
2071     //
2072     if (U_FAILURE(status)) {
2073         return 0;
2074     };
2075 
2076     if (destCapacity < 1) {
2077         status = U_ILLEGAL_ARGUMENT_ERROR;
2078         return 0;
2079     }
2080 
2081     //
2082     // Reset for the input text
2083     //
2084     reset(input);
2085     int64_t   nextOutputStringStart = 0;
2086     if (fActiveLimit == 0) {
2087         return 0;
2088     }
2089 
2090     //
2091     // Loop through the input text, searching for the delimiter pattern
2092     //
2093     int32_t i;
2094     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2095     for (i=0; ; i++) {
2096         if (i>=destCapacity-1) {
2097             // There is one or zero output string left.
2098             // Fill the last output string with whatever is left from the input, then exit the loop.
2099             //  ( i will be == destCapacity if we filled the output array while processing
2100             //    capture groups of the delimiter expression, in which case we will discard the
2101             //    last capture group saved in favor of the unprocessed remainder of the
2102             //    input string.)
2103             i = destCapacity-1;
2104             if (fActiveLimit > nextOutputStringStart) {
2105                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2106                     if (dest[i]) {
2107                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2108                                       input->chunkContents+nextOutputStringStart,
2109                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2110                     } else {
2111                         UText remainingText = UTEXT_INITIALIZER;
2112                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2113                                          fActiveLimit-nextOutputStringStart, &status);
2114                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2115                         utext_close(&remainingText);
2116                     }
2117                 } else {
2118                     UErrorCode lengthStatus = U_ZERO_ERROR;
2119                     int32_t remaining16Length =
2120                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2121                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2122                     if (remainingChars == NULL) {
2123                         status = U_MEMORY_ALLOCATION_ERROR;
2124                         break;
2125                     }
2126 
2127                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2128                     if (dest[i]) {
2129                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2130                     } else {
2131                         UText remainingText = UTEXT_INITIALIZER;
2132                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2133                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2134                         utext_close(&remainingText);
2135                     }
2136 
2137                     uprv_free(remainingChars);
2138                 }
2139             }
2140             break;
2141         }
2142         if (find()) {
2143             // We found another delimiter.  Move everything from where we started looking
2144             //  up until the start of the delimiter into the next output string.
2145             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2146                 if (dest[i]) {
2147                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2148                                   input->chunkContents+nextOutputStringStart,
2149                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2150                 } else {
2151                     UText remainingText = UTEXT_INITIALIZER;
2152                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2153                                       fMatchStart-nextOutputStringStart, &status);
2154                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2155                     utext_close(&remainingText);
2156                 }
2157             } else {
2158                 UErrorCode lengthStatus = U_ZERO_ERROR;
2159                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2160                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2161                 if (remainingChars == NULL) {
2162                     status = U_MEMORY_ALLOCATION_ERROR;
2163                     break;
2164                 }
2165                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2166                 if (dest[i]) {
2167                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2168                 } else {
2169                     UText remainingText = UTEXT_INITIALIZER;
2170                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2171                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2172                     utext_close(&remainingText);
2173                 }
2174 
2175                 uprv_free(remainingChars);
2176             }
2177             nextOutputStringStart = fMatchEnd;
2178 
2179             // If the delimiter pattern has capturing parentheses, the captured
2180             //  text goes out into the next n destination strings.
2181             int32_t groupNum;
2182             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2183                 if (i >= destCapacity-2) {
2184                     // Never fill the last available output string with capture group text.
2185                     // It will filled with the last field, the remainder of the
2186                     //  unsplit input text.
2187                     break;
2188                 }
2189                 i++;
2190                 dest[i] = utext_extract_replace(fInputText, dest[i],
2191                                                start64(groupNum, status), end64(groupNum, status), &status);
2192             }
2193 
2194             if (nextOutputStringStart == fActiveLimit) {
2195                 // The delimiter was at the end of the string.  We're done, but first
2196                 // we output one last empty string, for the empty field following
2197                 //   the delimiter at the end of input.
2198                 if (i+1 < destCapacity) {
2199                     ++i;
2200                     if (dest[i] == NULL) {
2201                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2202                     } else {
2203                         static const UChar emptyString[] = {(UChar)0};
2204                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2205                     }
2206                 }
2207                 break;
2208 
2209             }
2210         }
2211         else
2212         {
2213             // We ran off the end of the input while looking for the next delimiter.
2214             // All the remaining text goes into the current output string.
2215             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2216                 if (dest[i]) {
2217                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2218                                   input->chunkContents+nextOutputStringStart,
2219                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2220                 } else {
2221                     UText remainingText = UTEXT_INITIALIZER;
2222                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2223                                      fActiveLimit-nextOutputStringStart, &status);
2224                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2225                     utext_close(&remainingText);
2226                 }
2227             } else {
2228                 UErrorCode lengthStatus = U_ZERO_ERROR;
2229                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2230                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2231                 if (remainingChars == NULL) {
2232                     status = U_MEMORY_ALLOCATION_ERROR;
2233                     break;
2234                 }
2235 
2236                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2237                 if (dest[i]) {
2238                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2239                 } else {
2240                     UText remainingText = UTEXT_INITIALIZER;
2241                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2242                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2243                     utext_close(&remainingText);
2244                 }
2245 
2246                 uprv_free(remainingChars);
2247             }
2248             break;
2249         }
2250         if (U_FAILURE(status)) {
2251             break;
2252         }
2253     }   // end of for loop
2254     return i+1;
2255 }
2256 
2257 
2258 //--------------------------------------------------------------------------------
2259 //
2260 //     start
2261 //
2262 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2263 int32_t RegexMatcher::start(UErrorCode &status) const {
2264     return start(0, status);
2265 }
2266 
start64(UErrorCode & status) const2267 int64_t RegexMatcher::start64(UErrorCode &status) const {
2268     return start64(0, status);
2269 }
2270 
2271 //--------------------------------------------------------------------------------
2272 //
2273 //     start(int32_t group, UErrorCode &status)
2274 //
2275 //--------------------------------------------------------------------------------
2276 
start64(int32_t group,UErrorCode & status) const2277 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2278     if (U_FAILURE(status)) {
2279         return -1;
2280     }
2281     if (U_FAILURE(fDeferredStatus)) {
2282         status = fDeferredStatus;
2283         return -1;
2284     }
2285     if (fMatch == FALSE) {
2286         status = U_REGEX_INVALID_STATE;
2287         return -1;
2288     }
2289     if (group < 0 || group > fPattern->fGroupMap->size()) {
2290         status = U_INDEX_OUTOFBOUNDS_ERROR;
2291         return -1;
2292     }
2293     int64_t s;
2294     if (group == 0) {
2295         s = fMatchStart;
2296     } else {
2297         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2298         U_ASSERT(groupOffset < fPattern->fFrameSize);
2299         U_ASSERT(groupOffset >= 0);
2300         s = fFrame->fExtra[groupOffset];
2301     }
2302 
2303     return s;
2304 }
2305 
2306 
start(int32_t group,UErrorCode & status) const2307 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2308     return (int32_t)start64(group, status);
2309 }
2310 
2311 //--------------------------------------------------------------------------------
2312 //
2313 //     useAnchoringBounds
2314 //
2315 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2316 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2317     fAnchoringBounds = b;
2318     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2319     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2320     return *this;
2321 }
2322 
2323 
2324 //--------------------------------------------------------------------------------
2325 //
2326 //     useTransparentBounds
2327 //
2328 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2329 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2330     fTransparentBounds = b;
2331     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2332     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2333     return *this;
2334 }
2335 
2336 //--------------------------------------------------------------------------------
2337 //
2338 //     setTimeLimit
2339 //
2340 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2341 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2342     if (U_FAILURE(status)) {
2343         return;
2344     }
2345     if (U_FAILURE(fDeferredStatus)) {
2346         status = fDeferredStatus;
2347         return;
2348     }
2349     if (limit < 0) {
2350         status = U_ILLEGAL_ARGUMENT_ERROR;
2351         return;
2352     }
2353     fTimeLimit = limit;
2354 }
2355 
2356 
2357 //--------------------------------------------------------------------------------
2358 //
2359 //     getTimeLimit
2360 //
2361 //--------------------------------------------------------------------------------
getTimeLimit() const2362 int32_t RegexMatcher::getTimeLimit() const {
2363     return fTimeLimit;
2364 }
2365 
2366 
2367 //--------------------------------------------------------------------------------
2368 //
2369 //     setStackLimit
2370 //
2371 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2372 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2373     if (U_FAILURE(status)) {
2374         return;
2375     }
2376     if (U_FAILURE(fDeferredStatus)) {
2377         status = fDeferredStatus;
2378         return;
2379     }
2380     if (limit < 0) {
2381         status = U_ILLEGAL_ARGUMENT_ERROR;
2382         return;
2383     }
2384 
2385     // Reset the matcher.  This is needed here in case there is a current match
2386     //    whose final stack frame (containing the match results, pointed to by fFrame)
2387     //    would be lost by resizing to a smaller stack size.
2388     reset();
2389 
2390     if (limit == 0) {
2391         // Unlimited stack expansion
2392         fStack->setMaxCapacity(0);
2393     } else {
2394         // Change the units of the limit  from bytes to ints, and bump the size up
2395         //   to be big enough to hold at least one stack frame for the pattern,
2396         //   if it isn't there already.
2397         int32_t adjustedLimit = limit / sizeof(int32_t);
2398         if (adjustedLimit < fPattern->fFrameSize) {
2399             adjustedLimit = fPattern->fFrameSize;
2400         }
2401         fStack->setMaxCapacity(adjustedLimit);
2402     }
2403     fStackLimit = limit;
2404 }
2405 
2406 
2407 //--------------------------------------------------------------------------------
2408 //
2409 //     getStackLimit
2410 //
2411 //--------------------------------------------------------------------------------
getStackLimit() const2412 int32_t RegexMatcher::getStackLimit() const {
2413     return fStackLimit;
2414 }
2415 
2416 
2417 //--------------------------------------------------------------------------------
2418 //
2419 //     setMatchCallback
2420 //
2421 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2422 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2423                                     const void              *context,
2424                                     UErrorCode              &status) {
2425     if (U_FAILURE(status)) {
2426         return;
2427     }
2428     fCallbackFn = callback;
2429     fCallbackContext = context;
2430 }
2431 
2432 
2433 //--------------------------------------------------------------------------------
2434 //
2435 //     getMatchCallback
2436 //
2437 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2438 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2439                                   const void              *&context,
2440                                   UErrorCode              &status) {
2441     if (U_FAILURE(status)) {
2442        return;
2443     }
2444     callback = fCallbackFn;
2445     context  = fCallbackContext;
2446 }
2447 
2448 
2449 //--------------------------------------------------------------------------------
2450 //
2451 //     setMatchCallback
2452 //
2453 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2454 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2455                                                 const void                      *context,
2456                                                 UErrorCode                      &status) {
2457     if (U_FAILURE(status)) {
2458         return;
2459     }
2460     fFindProgressCallbackFn = callback;
2461     fFindProgressCallbackContext = context;
2462 }
2463 
2464 
2465 //--------------------------------------------------------------------------------
2466 //
2467 //     getMatchCallback
2468 //
2469 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2470 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2471                                                 const void                    *&context,
2472                                                 UErrorCode                    &status) {
2473     if (U_FAILURE(status)) {
2474        return;
2475     }
2476     callback = fFindProgressCallbackFn;
2477     context  = fFindProgressCallbackContext;
2478 }
2479 
2480 
2481 //================================================================================
2482 //
2483 //    Code following this point in this file is the internal
2484 //    Match Engine Implementation.
2485 //
2486 //================================================================================
2487 
2488 
2489 //--------------------------------------------------------------------------------
2490 //
2491 //   resetStack
2492 //           Discard any previous contents of the state save stack, and initialize a
2493 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2494 //           where they indicate that a group has not yet matched anything.
2495 //--------------------------------------------------------------------------------
resetStack()2496 REStackFrame *RegexMatcher::resetStack() {
2497     // Discard any previous contents of the state save stack, and initialize a
2498     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2499     //  where they indicate that a group has not yet matched anything.
2500     fStack->removeAllElements();
2501 
2502     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2503     if(U_FAILURE(fDeferredStatus)) {
2504         return NULL;
2505     }
2506 
2507     int32_t i;
2508     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2509         iFrame->fExtra[i] = -1;
2510     }
2511     return iFrame;
2512 }
2513 
2514 
2515 
2516 //--------------------------------------------------------------------------------
2517 //
2518 //   isWordBoundary
2519 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2520 //                     For us,
2521 //                       If the current char is a combining mark,
2522 //                          \b is FALSE.
2523 //                       Else Scan backwards to the first non-combining char.
2524 //                            We are at a boundary if the this char and the original chars are
2525 //                               opposite in membership in \w set
2526 //
2527 //          parameters:   pos   - the current position in the input buffer
2528 //
2529 //              TODO:  double-check edge cases at region boundaries.
2530 //
2531 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2532 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2533     UBool isBoundary = FALSE;
2534     UBool cIsWord    = FALSE;
2535 
2536     if (pos >= fLookLimit) {
2537         fHitEnd = TRUE;
2538     } else {
2539         // Determine whether char c at current position is a member of the word set of chars.
2540         // If we're off the end of the string, behave as though we're not at a word char.
2541         UTEXT_SETNATIVEINDEX(fInputText, pos);
2542         UChar32  c = UTEXT_CURRENT32(fInputText);
2543         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2544             // Current char is a combining one.  Not a boundary.
2545             return FALSE;
2546         }
2547         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2548     }
2549 
2550     // Back up until we come to a non-combining char, determine whether
2551     //  that char is a word char.
2552     UBool prevCIsWord = FALSE;
2553     for (;;) {
2554         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2555             break;
2556         }
2557         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2558         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2559               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2560             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2561             break;
2562         }
2563     }
2564     isBoundary = cIsWord ^ prevCIsWord;
2565     return isBoundary;
2566 }
2567 
isChunkWordBoundary(int32_t pos)2568 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2569     UBool isBoundary = FALSE;
2570     UBool cIsWord    = FALSE;
2571 
2572     const UChar *inputBuf = fInputText->chunkContents;
2573 
2574     if (pos >= fLookLimit) {
2575         fHitEnd = TRUE;
2576     } else {
2577         // Determine whether char c at current position is a member of the word set of chars.
2578         // If we're off the end of the string, behave as though we're not at a word char.
2579         UChar32 c;
2580         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2581         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2582             // Current char is a combining one.  Not a boundary.
2583             return FALSE;
2584         }
2585         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2586     }
2587 
2588     // Back up until we come to a non-combining char, determine whether
2589     //  that char is a word char.
2590     UBool prevCIsWord = FALSE;
2591     for (;;) {
2592         if (pos <= fLookStart) {
2593             break;
2594         }
2595         UChar32 prevChar;
2596         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2597         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2598               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2599             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2600             break;
2601         }
2602     }
2603     isBoundary = cIsWord ^ prevCIsWord;
2604     return isBoundary;
2605 }
2606 
2607 //--------------------------------------------------------------------------------
2608 //
2609 //   isUWordBoundary
2610 //
2611 //         Test for a word boundary using RBBI word break.
2612 //
2613 //          parameters:   pos   - the current position in the input buffer
2614 //
2615 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2616 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2617     UBool       returnVal = FALSE;
2618 #if UCONFIG_NO_BREAK_ITERATION==0
2619 
2620     // If we haven't yet created a break iterator for this matcher, do it now.
2621     if (fWordBreakItr == NULL) {
2622         fWordBreakItr =
2623             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2624         if (U_FAILURE(fDeferredStatus)) {
2625             return FALSE;
2626         }
2627         fWordBreakItr->setText(fInputText, fDeferredStatus);
2628     }
2629 
2630     if (pos >= fLookLimit) {
2631         fHitEnd = TRUE;
2632         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2633                             //    words are not boundaries.  All non-word chars stand by themselves,
2634                             //    with word boundaries on both sides.
2635     } else {
2636         if (!UTEXT_USES_U16(fInputText)) {
2637             // !!!: Would like a better way to do this!
2638             UErrorCode status = U_ZERO_ERROR;
2639             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2640         }
2641         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2642     }
2643 #endif
2644     return   returnVal;
2645 }
2646 
2647 //--------------------------------------------------------------------------------
2648 //
2649 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2650 //                     saves. Increment the "time" counter, and call the
2651 //                     user callback function if there is one installed.
2652 //
2653 //                     If the match operation needs to be aborted, either for a time-out
2654 //                     or because the user callback asked for it, just set an error status.
2655 //                     The engine will pick that up and stop in its outer loop.
2656 //
2657 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2658 void RegexMatcher::IncrementTime(UErrorCode &status) {
2659     fTickCounter = TIMER_INITIAL_VALUE;
2660     fTime++;
2661     if (fCallbackFn != NULL) {
2662         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2663             status = U_REGEX_STOPPED_BY_CALLER;
2664             return;
2665         }
2666     }
2667     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2668         status = U_REGEX_TIME_OUT;
2669     }
2670 }
2671 
2672 //--------------------------------------------------------------------------------
2673 //
2674 //   StateSave
2675 //       Make a new stack frame, initialized as a copy of the current stack frame.
2676 //       Set the pattern index in the original stack frame from the operand value
2677 //       in the opcode.  Execution of the engine continues with the state in
2678 //       the newly created stack frame
2679 //
2680 //       Note that reserveBlock() may grow the stack, resulting in the
2681 //       whole thing being relocated in memory.
2682 //
2683 //    Parameters:
2684 //       fp           The top frame pointer when called.  At return, a new
2685 //                    fame will be present
2686 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2687 //                    (not new) frame.  If execution ever back-tracks out of the
2688 //                    new frame, this will be where we continue from in the pattern.
2689 //    Return
2690 //                    The new frame pointer.
2691 //
2692 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2693 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2694     if (U_FAILURE(status)) {
2695         return fp;
2696     }
2697     // push storage for a new frame.
2698     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2699     if (U_FAILURE(status)) {
2700         // Failure on attempted stack expansion.
2701         //   Stack function set some other error code, change it to a more
2702         //   specific one for regular expressions.
2703         status = U_REGEX_STACK_OVERFLOW;
2704         // We need to return a writable stack frame, so just return the
2705         //    previous frame.  The match operation will stop quickly
2706         //    because of the error status, after which the frame will never
2707         //    be looked at again.
2708         return fp;
2709     }
2710     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2711 
2712     // New stack frame = copy of old top frame.
2713     int64_t *source = (int64_t *)fp;
2714     int64_t *dest   = newFP;
2715     for (;;) {
2716         *dest++ = *source++;
2717         if (source == newFP) {
2718             break;
2719         }
2720     }
2721 
2722     fTickCounter--;
2723     if (fTickCounter <= 0) {
2724        IncrementTime(status);    // Re-initializes fTickCounter
2725     }
2726     fp->fPatIdx = savePatIdx;
2727     return (REStackFrame *)newFP;
2728 }
2729 
2730 #if defined(REGEX_DEBUG)
2731 namespace {
StringFromUText(UText * ut)2732 UnicodeString StringFromUText(UText *ut) {
2733     UnicodeString result;
2734     for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2735         result.append(c);
2736     }
2737     return result;
2738 }
2739 }
2740 #endif // REGEX_DEBUG
2741 
2742 
2743 //--------------------------------------------------------------------------------
2744 //
2745 //   MatchAt      This is the actual matching engine.
2746 //
2747 //                  startIdx:    begin matching a this index.
2748 //                  toEnd:       if true, match must extend to end of the input region
2749 //
2750 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752     UBool       isMatch  = FALSE;      // True if the we have a match.
2753 
2754     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2755 
2756     int32_t     op;                    // Operation from the compiled pattern, split into
2757     int32_t     opType;                //    the opcode
2758     int32_t     opValue;               //    and the operand value.
2759 
2760 #ifdef REGEX_RUN_DEBUG
2761     if (fTraceDebug) {
2762         printf("MatchAt(startIdx=%ld)\n", startIdx);
2763         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2764         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2765     }
2766 #endif
2767 
2768     if (U_FAILURE(status)) {
2769         return;
2770     }
2771 
2772     //  Cache frequently referenced items from the compiled pattern
2773     //
2774     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2775 
2776     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2777     UVector             *fSets         = fPattern->fSets;
2778 
2779     fFrameSize = fPattern->fFrameSize;
2780     REStackFrame        *fp            = resetStack();
2781     if (U_FAILURE(fDeferredStatus)) {
2782         status = fDeferredStatus;
2783         return;
2784     }
2785 
2786     fp->fPatIdx   = 0;
2787     fp->fInputIdx = startIdx;
2788 
2789     // Zero out the pattern's static data
2790     int32_t i;
2791     for (i = 0; i<fPattern->fDataSize; i++) {
2792         fData[i] = 0;
2793     }
2794 
2795     //
2796     //  Main loop for interpreting the compiled pattern.
2797     //  One iteration of the loop per pattern operation performed.
2798     //
2799     for (;;) {
2800         op      = (int32_t)pat[fp->fPatIdx];
2801         opType  = URX_TYPE(op);
2802         opValue = URX_VAL(op);
2803 #ifdef REGEX_RUN_DEBUG
2804         if (fTraceDebug) {
2805             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2806             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2807                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2808             fPattern->dumpOp(fp->fPatIdx);
2809         }
2810 #endif
2811         fp->fPatIdx++;
2812 
2813         switch (opType) {
2814 
2815 
2816         case URX_NOP:
2817             break;
2818 
2819 
2820         case URX_BACKTRACK:
2821             // Force a backtrack.  In some circumstances, the pattern compiler
2822             //   will notice that the pattern can't possibly match anything, and will
2823             //   emit one of these at that point.
2824             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2825             break;
2826 
2827 
2828         case URX_ONECHAR:
2829             if (fp->fInputIdx < fActiveLimit) {
2830                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2831                 UChar32 c = UTEXT_NEXT32(fInputText);
2832                 if (c == opValue) {
2833                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2834                     break;
2835                 }
2836             } else {
2837                 fHitEnd = TRUE;
2838             }
2839             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2840             break;
2841 
2842 
2843         case URX_STRING:
2844             {
2845                 // Test input against a literal string.
2846                 // Strings require two slots in the compiled pattern, one for the
2847                 //   offset to the string text, and one for the length.
2848 
2849                 int32_t   stringStartIdx = opValue;
2850                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2851                 fp->fPatIdx++;
2852                 opType    = URX_TYPE(op);
2853                 int32_t stringLen = URX_VAL(op);
2854                 U_ASSERT(opType == URX_STRING_LEN);
2855                 U_ASSERT(stringLen >= 2);
2856 
2857                 const UChar *patternString = litText+stringStartIdx;
2858                 int32_t patternStringIndex = 0;
2859                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2860                 UChar32 inputChar;
2861                 UChar32 patternChar;
2862                 UBool success = TRUE;
2863                 while (patternStringIndex < stringLen) {
2864                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2865                         success = FALSE;
2866                         fHitEnd = TRUE;
2867                         break;
2868                     }
2869                     inputChar = UTEXT_NEXT32(fInputText);
2870                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2871                     if (patternChar != inputChar) {
2872                         success = FALSE;
2873                         break;
2874                     }
2875                 }
2876 
2877                 if (success) {
2878                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2879                 } else {
2880                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2881                 }
2882             }
2883             break;
2884 
2885 
2886         case URX_STATE_SAVE:
2887             fp = StateSave(fp, opValue, status);
2888             break;
2889 
2890 
2891         case URX_END:
2892             // The match loop will exit via this path on a successful match,
2893             //   when we reach the end of the pattern.
2894             if (toEnd && fp->fInputIdx != fActiveLimit) {
2895                 // The pattern matched, but not to the end of input.  Try some more.
2896                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2897                 break;
2898             }
2899             isMatch = TRUE;
2900             goto  breakFromLoop;
2901 
2902         // Start and End Capture stack frame variables are laid out out like this:
2903             //  fp->fExtra[opValue]  - The start of a completed capture group
2904             //             opValue+1 - The end   of a completed capture group
2905             //             opValue+2 - the start of a capture group whose end
2906             //                          has not yet been reached (and might not ever be).
2907         case URX_START_CAPTURE:
2908             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2909             fp->fExtra[opValue+2] = fp->fInputIdx;
2910             break;
2911 
2912 
2913         case URX_END_CAPTURE:
2914             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2915             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2916             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2917             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2918             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2919             break;
2920 
2921 
2922         case URX_DOLLAR:                   //  $, test for End of line
2923                                            //     or for position before new line at end of input
2924             {
2925                 if (fp->fInputIdx >= fAnchorLimit) {
2926                     // We really are at the end of input.  Success.
2927                     fHitEnd = TRUE;
2928                     fRequireEnd = TRUE;
2929                     break;
2930                 }
2931 
2932                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2933 
2934                 // If we are positioned just before a new-line that is located at the
2935                 //   end of input, succeed.
2936                 UChar32 c = UTEXT_NEXT32(fInputText);
2937                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2938                     if (isLineTerminator(c)) {
2939                         // If not in the middle of a CR/LF sequence
2940                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2941                             // At new-line at end of input. Success
2942                             fHitEnd = TRUE;
2943                             fRequireEnd = TRUE;
2944 
2945                             break;
2946                         }
2947                     }
2948                 } else {
2949                     UChar32 nextC = UTEXT_NEXT32(fInputText);
2950                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2951                         fHitEnd = TRUE;
2952                         fRequireEnd = TRUE;
2953                         break;                         // At CR/LF at end of input.  Success
2954                     }
2955                 }
2956 
2957                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2958             }
2959             break;
2960 
2961 
2962          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
2963             if (fp->fInputIdx >= fAnchorLimit) {
2964                 // Off the end of input.  Success.
2965                 fHitEnd = TRUE;
2966                 fRequireEnd = TRUE;
2967                 break;
2968             } else {
2969                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2970                 UChar32 c = UTEXT_NEXT32(fInputText);
2971                 // Either at the last character of input, or off the end.
2972                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2973                     fHitEnd = TRUE;
2974                     fRequireEnd = TRUE;
2975                     break;
2976                 }
2977             }
2978 
2979             // Not at end of input.  Back-track out.
2980             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2981             break;
2982 
2983 
2984          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
2985              {
2986                  if (fp->fInputIdx >= fAnchorLimit) {
2987                      // We really are at the end of input.  Success.
2988                      fHitEnd = TRUE;
2989                      fRequireEnd = TRUE;
2990                      break;
2991                  }
2992                  // If we are positioned just before a new-line, succeed.
2993                  // It makes no difference where the new-line is within the input.
2994                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2995                  UChar32 c = UTEXT_CURRENT32(fInputText);
2996                  if (isLineTerminator(c)) {
2997                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
2998                      //  In multi-line mode, hitting a new-line just before the end of input does not
2999                      //   set the hitEnd or requireEnd flags
3000                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3001                         break;
3002                      }
3003                  }
3004                  // not at a new line.  Fail.
3005                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3006              }
3007              break;
3008 
3009 
3010          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3011              {
3012                  if (fp->fInputIdx >= fAnchorLimit) {
3013                      // We really are at the end of input.  Success.
3014                      fHitEnd = TRUE;
3015                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3016                      break;               //   adding a new-line would not lose the match.
3017                  }
3018                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3019                  // It makes no difference where the new-line is within the input.
3020                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3021                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3022                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3023                  }
3024              }
3025              break;
3026 
3027 
3028        case URX_CARET:                    //  ^, test for start of line
3029             if (fp->fInputIdx != fAnchorStart) {
3030                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3031             }
3032             break;
3033 
3034 
3035        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3036            {
3037                if (fp->fInputIdx == fAnchorStart) {
3038                    // We are at the start input.  Success.
3039                    break;
3040                }
3041                // Check whether character just before the current pos is a new-line
3042                //   unless we are at the end of input
3043                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3044                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3045                if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3046                    //  It's a new-line.  ^ is true.  Success.
3047                    //  TODO:  what should be done with positions between a CR and LF?
3048                    break;
3049                }
3050                // Not at the start of a line.  Fail.
3051                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3052            }
3053            break;
3054 
3055 
3056        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3057            {
3058                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3059                if (fp->fInputIdx <= fAnchorStart) {
3060                    // We are at the start input.  Success.
3061                    break;
3062                }
3063                // Check whether character just before the current pos is a new-line
3064                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3065                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3066                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3067                if (c != 0x0a) {
3068                    // Not at the start of a line.  Back-track out.
3069                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3070                }
3071            }
3072            break;
3073 
3074         case URX_BACKSLASH_B:          // Test for word boundaries
3075             {
3076                 UBool success = isWordBoundary(fp->fInputIdx);
3077                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3078                 if (!success) {
3079                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3080                 }
3081             }
3082             break;
3083 
3084 
3085         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3086             {
3087                 UBool success = isUWordBoundary(fp->fInputIdx);
3088                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3089                 if (!success) {
3090                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3091                 }
3092             }
3093             break;
3094 
3095 
3096         case URX_BACKSLASH_D:            // Test for decimal digit
3097             {
3098                 if (fp->fInputIdx >= fActiveLimit) {
3099                     fHitEnd = TRUE;
3100                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3101                     break;
3102                 }
3103 
3104                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3105 
3106                 UChar32 c = UTEXT_NEXT32(fInputText);
3107                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3108                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3109                 success ^= (UBool)(opValue != 0);        // flip sense for \D
3110                 if (success) {
3111                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3112                 } else {
3113                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3114                 }
3115             }
3116             break;
3117 
3118 
3119         case URX_BACKSLASH_G:          // Test for position at end of previous match
3120             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3121                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3122             }
3123             break;
3124 
3125 
3126         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
3127             {
3128                 if (fp->fInputIdx >= fActiveLimit) {
3129                     fHitEnd = TRUE;
3130                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3131                     break;
3132                 }
3133                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3134                 UChar32 c = UTEXT_NEXT32(fInputText);
3135                 int8_t ctype = u_charType(c);
3136                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
3137                 success ^= (UBool)(opValue != 0);        // flip sense for \H
3138                 if (success) {
3139                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3140                 } else {
3141                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3142                 }
3143             }
3144             break;
3145 
3146 
3147         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
3148             {
3149                 if (fp->fInputIdx >= fActiveLimit) {
3150                     fHitEnd = TRUE;
3151                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3152                     break;
3153                 }
3154                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3155                 UChar32 c = UTEXT_NEXT32(fInputText);
3156                 if (isLineTerminator(c)) {
3157                     if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3158                         utext_next32(fInputText);
3159                     }
3160                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3161                 } else {
3162                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3163                 }
3164             }
3165             break;
3166 
3167 
3168         case URX_BACKSLASH_V:            // \v, any single line ending character.
3169             {
3170                 if (fp->fInputIdx >= fActiveLimit) {
3171                     fHitEnd = TRUE;
3172                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3173                     break;
3174                 }
3175                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3176                 UChar32 c = UTEXT_NEXT32(fInputText);
3177                 UBool success = isLineTerminator(c);
3178                 success ^= (UBool)(opValue != 0);        // flip sense for \V
3179                 if (success) {
3180                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3181                 } else {
3182                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3183                 }
3184             }
3185             break;
3186 
3187 
3188         case URX_BACKSLASH_X:
3189             //  Match a Grapheme, as defined by Unicode TR 29.
3190             //  Differs slightly from Perl, which consumes combining marks independently
3191             //    of context.
3192             {
3193 
3194                 // Fail if at end of input
3195                 if (fp->fInputIdx >= fActiveLimit) {
3196                     fHitEnd = TRUE;
3197                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3198                     break;
3199                 }
3200 
3201                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3202 
3203                 // Examine (and consume) the current char.
3204                 //   Dispatch into a little state machine, based on the char.
3205                 UChar32  c;
3206                 c = UTEXT_NEXT32(fInputText);
3207                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3208                 UnicodeSet **sets = fPattern->fStaticSets;
3209                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3210                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3211                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3212                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3213                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3214                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3215                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3216                 goto GC_Extend;
3217 
3218 
3219 
3220 GC_L:
3221                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3222                 c = UTEXT_NEXT32(fInputText);
3223                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3224                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3225                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3226                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3227                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3228                 (void)UTEXT_PREVIOUS32(fInputText);
3229                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3230                 goto GC_Extend;
3231 
3232 GC_V:
3233                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3234                 c = UTEXT_NEXT32(fInputText);
3235                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3236                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3237                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3238                 (void)UTEXT_PREVIOUS32(fInputText);
3239                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3240                 goto GC_Extend;
3241 
3242 GC_T:
3243                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3244                 c = UTEXT_NEXT32(fInputText);
3245                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3246                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3247                 (void)UTEXT_PREVIOUS32(fInputText);
3248                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3249                 goto GC_Extend;
3250 
3251 GC_Extend:
3252                 // Combining characters are consumed here
3253                 for (;;) {
3254                     if (fp->fInputIdx >= fActiveLimit) {
3255                         break;
3256                     }
3257                     c = UTEXT_CURRENT32(fInputText);
3258                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3259                         break;
3260                     }
3261                     (void)UTEXT_NEXT32(fInputText);
3262                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3263                 }
3264                 goto GC_Done;
3265 
3266 GC_Control:
3267                 // Most control chars stand alone (don't combine with combining chars),
3268                 //   except for that CR/LF sequence is a single grapheme cluster.
3269                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3270                     c = UTEXT_NEXT32(fInputText);
3271                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3272                 }
3273 
3274 GC_Done:
3275                 if (fp->fInputIdx >= fActiveLimit) {
3276                     fHitEnd = TRUE;
3277                 }
3278                 break;
3279             }
3280 
3281 
3282 
3283 
3284         case URX_BACKSLASH_Z:          // Test for end of Input
3285             if (fp->fInputIdx < fAnchorLimit) {
3286                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3287             } else {
3288                 fHitEnd = TRUE;
3289                 fRequireEnd = TRUE;
3290             }
3291             break;
3292 
3293 
3294 
3295         case URX_STATIC_SETREF:
3296             {
3297                 // Test input character against one of the predefined sets
3298                 //    (Word Characters, for example)
3299                 // The high bit of the op value is a flag for the match polarity.
3300                 //    0:   success if input char is in set.
3301                 //    1:   success if input char is not in set.
3302                 if (fp->fInputIdx >= fActiveLimit) {
3303                     fHitEnd = TRUE;
3304                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3305                     break;
3306                 }
3307 
3308                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3309                 opValue &= ~URX_NEG_SET;
3310                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3311 
3312                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3313                 UChar32 c = UTEXT_NEXT32(fInputText);
3314                 if (c < 256) {
3315                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3316                     if (s8->contains(c)) {
3317                         success = !success;
3318                     }
3319                 } else {
3320                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321                     if (s->contains(c)) {
3322                         success = !success;
3323                     }
3324                 }
3325                 if (success) {
3326                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3327                 } else {
3328                     // the character wasn't in the set.
3329                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3330                 }
3331             }
3332             break;
3333 
3334 
3335         case URX_STAT_SETREF_N:
3336             {
3337                 // Test input character for NOT being a member of  one of
3338                 //    the predefined sets (Word Characters, for example)
3339                 if (fp->fInputIdx >= fActiveLimit) {
3340                     fHitEnd = TRUE;
3341                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3342                     break;
3343                 }
3344 
3345                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3346 
3347                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3348 
3349                 UChar32 c = UTEXT_NEXT32(fInputText);
3350                 if (c < 256) {
3351                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3352                     if (s8->contains(c) == FALSE) {
3353                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3354                         break;
3355                     }
3356                 } else {
3357                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3358                     if (s->contains(c) == FALSE) {
3359                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3360                         break;
3361                     }
3362                 }
3363                 // the character wasn't in the set.
3364                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3365             }
3366             break;
3367 
3368 
3369         case URX_SETREF:
3370             if (fp->fInputIdx >= fActiveLimit) {
3371                 fHitEnd = TRUE;
3372                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3373                 break;
3374             } else {
3375                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3376 
3377                 // There is input left.  Pick up one char and test it for set membership.
3378                 UChar32 c = UTEXT_NEXT32(fInputText);
3379                 U_ASSERT(opValue > 0 && opValue < fSets->size());
3380                 if (c<256) {
3381                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3382                     if (s8->contains(c)) {
3383                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3384                         break;
3385                     }
3386                 } else {
3387                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3388                     if (s->contains(c)) {
3389                         // The character is in the set.  A Match.
3390                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3391                         break;
3392                     }
3393                 }
3394 
3395                 // the character wasn't in the set.
3396                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3397             }
3398             break;
3399 
3400 
3401         case URX_DOTANY:
3402             {
3403                 // . matches anything, but stops at end-of-line.
3404                 if (fp->fInputIdx >= fActiveLimit) {
3405                     // At end of input.  Match failed.  Backtrack out.
3406                     fHitEnd = TRUE;
3407                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3408                     break;
3409                 }
3410 
3411                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3412 
3413                 // There is input left.  Advance over one char, unless we've hit end-of-line
3414                 UChar32 c = UTEXT_NEXT32(fInputText);
3415                 if (isLineTerminator(c)) {
3416                     // End of line in normal mode.   . does not match.
3417                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3418                     break;
3419                 }
3420                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3421             }
3422             break;
3423 
3424 
3425         case URX_DOTANY_ALL:
3426             {
3427                 // ., in dot-matches-all (including new lines) mode
3428                 if (fp->fInputIdx >= fActiveLimit) {
3429                     // At end of input.  Match failed.  Backtrack out.
3430                     fHitEnd = TRUE;
3431                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3432                     break;
3433                 }
3434 
3435                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3436 
3437                 // There is input left.  Advance over one char, except if we are
3438                 //   at a cr/lf, advance over both of them.
3439                 UChar32 c;
3440                 c = UTEXT_NEXT32(fInputText);
3441                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3442                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3443                     // In the case of a CR/LF, we need to advance over both.
3444                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3445                     if (nextc == 0x0a) {
3446                         (void)UTEXT_NEXT32(fInputText);
3447                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3448                     }
3449                 }
3450             }
3451             break;
3452 
3453 
3454         case URX_DOTANY_UNIX:
3455             {
3456                 // '.' operator, matches all, but stops at end-of-line.
3457                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3458                 if (fp->fInputIdx >= fActiveLimit) {
3459                     // At end of input.  Match failed.  Backtrack out.
3460                     fHitEnd = TRUE;
3461                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3462                     break;
3463                 }
3464 
3465                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3466 
3467                 // There is input left.  Advance over one char, unless we've hit end-of-line
3468                 UChar32 c = UTEXT_NEXT32(fInputText);
3469                 if (c == 0x0a) {
3470                     // End of line in normal mode.   '.' does not match the \n
3471                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3472                 } else {
3473                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3474                 }
3475             }
3476             break;
3477 
3478 
3479         case URX_JMP:
3480             fp->fPatIdx = opValue;
3481             break;
3482 
3483         case URX_FAIL:
3484             isMatch = FALSE;
3485             goto breakFromLoop;
3486 
3487         case URX_JMP_SAV:
3488             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3489             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3490             fp->fPatIdx = opValue;                         // Then JMP.
3491             break;
3492 
3493         case URX_JMP_SAV_X:
3494             // This opcode is used with (x)+, when x can match a zero length string.
3495             // Same as JMP_SAV, except conditional on the match having made forward progress.
3496             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3497             //   data address of the input position at the start of the loop.
3498             {
3499                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3500                 int32_t  stoOp = (int32_t)pat[opValue-1];
3501                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3502                 int32_t  frameLoc = URX_VAL(stoOp);
3503                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3504                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3505                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3506                 if (prevInputIdx < fp->fInputIdx) {
3507                     // The match did make progress.  Repeat the loop.
3508                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3509                     fp->fPatIdx = opValue;
3510                     fp->fExtra[frameLoc] = fp->fInputIdx;
3511                 }
3512                 // If the input position did not advance, we do nothing here,
3513                 //   execution will fall out of the loop.
3514             }
3515             break;
3516 
3517         case URX_CTR_INIT:
3518             {
3519                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3520                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3521 
3522                 // Pick up the three extra operands that CTR_INIT has, and
3523                 //    skip the pattern location counter past
3524                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3525                 fp->fPatIdx += 3;
3526                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3527                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3528                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3529                 U_ASSERT(minCount>=0);
3530                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3531                 U_ASSERT(loopLoc>=fp->fPatIdx);
3532 
3533                 if (minCount == 0) {
3534                     fp = StateSave(fp, loopLoc+1, status);
3535                 }
3536                 if (maxCount == -1) {
3537                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3538                 } else if (maxCount == 0) {
3539                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3540                 }
3541             }
3542             break;
3543 
3544         case URX_CTR_LOOP:
3545             {
3546                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3547                 int32_t initOp = (int32_t)pat[opValue];
3548                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3549                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3550                 int32_t minCount  = (int32_t)pat[opValue+2];
3551                 int32_t maxCount  = (int32_t)pat[opValue+3];
3552                 (*pCounter)++;
3553                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3554                     U_ASSERT(*pCounter == maxCount);
3555                     break;
3556                 }
3557                 if (*pCounter >= minCount) {
3558                     if (maxCount == -1) {
3559                         // Loop has no hard upper bound.
3560                         // Check that it is progressing through the input, break if it is not.
3561                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3562                         if (fp->fInputIdx == *pLastInputIdx) {
3563                             break;
3564                         } else {
3565                             *pLastInputIdx = fp->fInputIdx;
3566                         }
3567                     }
3568                     fp = StateSave(fp, fp->fPatIdx, status);
3569                 } else {
3570                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3571                     fTickCounter--;
3572                     if (fTickCounter <= 0) {
3573                         IncrementTime(status);    // Re-initializes fTickCounter
3574                     }
3575                 }
3576 
3577                 fp->fPatIdx = opValue + 4;    // Loop back.
3578             }
3579             break;
3580 
3581         case URX_CTR_INIT_NG:
3582             {
3583                 // Initialize a non-greedy loop
3584                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3585                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3586 
3587                 // Pick up the three extra operands that CTR_INIT_NG has, and
3588                 //    skip the pattern location counter past
3589                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3590                 fp->fPatIdx += 3;
3591                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3592                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3593                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3594                 U_ASSERT(minCount>=0);
3595                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3596                 U_ASSERT(loopLoc>fp->fPatIdx);
3597                 if (maxCount == -1) {
3598                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3599                 }
3600 
3601                 if (minCount == 0) {
3602                     if (maxCount != 0) {
3603                         fp = StateSave(fp, fp->fPatIdx, status);
3604                     }
3605                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3606                 }
3607             }
3608             break;
3609 
3610         case URX_CTR_LOOP_NG:
3611             {
3612                 // Non-greedy {min, max} loops
3613                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3614                 int32_t initOp = (int32_t)pat[opValue];
3615                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3616                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3617                 int32_t minCount  = (int32_t)pat[opValue+2];
3618                 int32_t maxCount  = (int32_t)pat[opValue+3];
3619 
3620                 (*pCounter)++;
3621                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3622                     // The loop has matched the maximum permitted number of times.
3623                     //   Break out of here with no action.  Matching will
3624                     //   continue with the following pattern.
3625                     U_ASSERT(*pCounter == maxCount);
3626                     break;
3627                 }
3628 
3629                 if (*pCounter < minCount) {
3630                     // We haven't met the minimum number of matches yet.
3631                     //   Loop back for another one.
3632                     fp->fPatIdx = opValue + 4;    // Loop back.
3633                     // Increment time-out counter. (StateSave() does it if count >= minCount)
3634                     fTickCounter--;
3635                     if (fTickCounter <= 0) {
3636                         IncrementTime(status);    // Re-initializes fTickCounter
3637                     }
3638                 } else {
3639                     // We do have the minimum number of matches.
3640 
3641                     // If there is no upper bound on the loop iterations, check that the input index
3642                     // is progressing, and stop the loop if it is not.
3643                     if (maxCount == -1) {
3644                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3645                         if (fp->fInputIdx == *pLastInputIdx) {
3646                             break;
3647                         }
3648                         *pLastInputIdx = fp->fInputIdx;
3649                     }
3650 
3651                     // Loop Continuation: we will fall into the pattern following the loop
3652                     //   (non-greedy, don't execute loop body first), but first do
3653                     //   a state save to the top of the loop, so that a match failure
3654                     //   in the following pattern will try another iteration of the loop.
3655                     fp = StateSave(fp, opValue + 4, status);
3656                 }
3657             }
3658             break;
3659 
3660         case URX_STO_SP:
3661             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3662             fData[opValue] = fStack->size();
3663             break;
3664 
3665         case URX_LD_SP:
3666             {
3667                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3668                 int32_t newStackSize = (int32_t)fData[opValue];
3669                 U_ASSERT(newStackSize <= fStack->size());
3670                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3671                 if (newFP == (int64_t *)fp) {
3672                     break;
3673                 }
3674                 int32_t j;
3675                 for (j=0; j<fFrameSize; j++) {
3676                     newFP[j] = ((int64_t *)fp)[j];
3677                 }
3678                 fp = (REStackFrame *)newFP;
3679                 fStack->setSize(newStackSize);
3680             }
3681             break;
3682 
3683         case URX_BACKREF:
3684             {
3685                 U_ASSERT(opValue < fFrameSize);
3686                 int64_t groupStartIdx = fp->fExtra[opValue];
3687                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3688                 U_ASSERT(groupStartIdx <= groupEndIdx);
3689                 if (groupStartIdx < 0) {
3690                     // This capture group has not participated in the match thus far,
3691                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3692                     break;
3693                 }
3694                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3695                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3696 
3697                 //   Note: if the capture group match was of an empty string the backref
3698                 //         match succeeds.  Verified by testing:  Perl matches succeed
3699                 //         in this case, so we do too.
3700 
3701                 UBool success = TRUE;
3702                 for (;;) {
3703                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3704                         success = TRUE;
3705                         break;
3706                     }
3707                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3708                         success = FALSE;
3709                         fHitEnd = TRUE;
3710                         break;
3711                     }
3712                     UChar32 captureGroupChar = utext_next32(fAltInputText);
3713                     UChar32 inputChar = utext_next32(fInputText);
3714                     if (inputChar != captureGroupChar) {
3715                         success = FALSE;
3716                         break;
3717                     }
3718                 }
3719 
3720                 if (success) {
3721                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3722                 } else {
3723                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3724                 }
3725             }
3726             break;
3727 
3728 
3729 
3730         case URX_BACKREF_I:
3731             {
3732                 U_ASSERT(opValue < fFrameSize);
3733                 int64_t groupStartIdx = fp->fExtra[opValue];
3734                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3735                 U_ASSERT(groupStartIdx <= groupEndIdx);
3736                 if (groupStartIdx < 0) {
3737                     // This capture group has not participated in the match thus far,
3738                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3739                     break;
3740                 }
3741                 utext_setNativeIndex(fAltInputText, groupStartIdx);
3742                 utext_setNativeIndex(fInputText, fp->fInputIdx);
3743                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3744                 CaseFoldingUTextIterator inputItr(*fInputText);
3745 
3746                 //   Note: if the capture group match was of an empty string the backref
3747                 //         match succeeds.  Verified by testing:  Perl matches succeed
3748                 //         in this case, so we do too.
3749 
3750                 UBool success = TRUE;
3751                 for (;;) {
3752                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3753                         success = TRUE;
3754                         break;
3755                     }
3756                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3757                         success = FALSE;
3758                         fHitEnd = TRUE;
3759                         break;
3760                     }
3761                     UChar32 captureGroupChar = captureGroupItr.next();
3762                     UChar32 inputChar = inputItr.next();
3763                     if (inputChar != captureGroupChar) {
3764                         success = FALSE;
3765                         break;
3766                     }
3767                 }
3768 
3769                 if (success && inputItr.inExpansion()) {
3770                     // We otained a match by consuming part of a string obtained from
3771                     // case-folding a single code point of the input text.
3772                     // This does not count as an overall match.
3773                     success = FALSE;
3774                 }
3775 
3776                 if (success) {
3777                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3778                 } else {
3779                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3780                 }
3781 
3782             }
3783             break;
3784 
3785         case URX_STO_INP_LOC:
3786             {
3787                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3788                 fp->fExtra[opValue] = fp->fInputIdx;
3789             }
3790             break;
3791 
3792         case URX_JMPX:
3793             {
3794                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3795                 fp->fPatIdx += 1;
3796                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3797                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3798                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3799                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3800                 if (savedInputIdx < fp->fInputIdx) {
3801                     fp->fPatIdx = opValue;                               // JMP
3802                 } else {
3803                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3804                 }
3805             }
3806             break;
3807 
3808         case URX_LA_START:
3809             {
3810                 // Entering a lookahead block.
3811                 // Save Stack Ptr, Input Pos.
3812                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3813                 fData[opValue]   = fStack->size();
3814                 fData[opValue+1] = fp->fInputIdx;
3815                 fActiveStart     = fLookStart;          // Set the match region change for
3816                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3817             }
3818             break;
3819 
3820         case URX_LA_END:
3821             {
3822                 // Leaving a look-ahead block.
3823                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3824                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3825                 int32_t stackSize = fStack->size();
3826                 int32_t newStackSize =(int32_t)fData[opValue];
3827                 U_ASSERT(stackSize >= newStackSize);
3828                 if (stackSize > newStackSize) {
3829                     // Copy the current top frame back to the new (cut back) top frame.
3830                     //   This makes the capture groups from within the look-ahead
3831                     //   expression available.
3832                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3833                     int32_t j;
3834                     for (j=0; j<fFrameSize; j++) {
3835                         newFP[j] = ((int64_t *)fp)[j];
3836                     }
3837                     fp = (REStackFrame *)newFP;
3838                     fStack->setSize(newStackSize);
3839                 }
3840                 fp->fInputIdx = fData[opValue+1];
3841 
3842                 // Restore the active region bounds in the input string; they may have
3843                 //    been changed because of transparent bounds on a Region.
3844                 fActiveStart = fRegionStart;
3845                 fActiveLimit = fRegionLimit;
3846             }
3847             break;
3848 
3849         case URX_ONECHAR_I:
3850             // Case insensitive one char.  The char from the pattern is already case folded.
3851             // Input text is not, but case folding the input can not reduce two or more code
3852             // points to one.
3853             if (fp->fInputIdx < fActiveLimit) {
3854                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3855 
3856                 UChar32 c = UTEXT_NEXT32(fInputText);
3857                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3858                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3859                     break;
3860                 }
3861             } else {
3862                 fHitEnd = TRUE;
3863             }
3864 
3865             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3866             break;
3867 
3868         case URX_STRING_I:
3869             {
3870                 // Case-insensitive test input against a literal string.
3871                 // Strings require two slots in the compiled pattern, one for the
3872                 //   offset to the string text, and one for the length.
3873                 //   The compiled string has already been case folded.
3874                 {
3875                     const UChar *patternString = litText + opValue;
3876                     int32_t      patternStringIdx  = 0;
3877 
3878                     op      = (int32_t)pat[fp->fPatIdx];
3879                     fp->fPatIdx++;
3880                     opType  = URX_TYPE(op);
3881                     opValue = URX_VAL(op);
3882                     U_ASSERT(opType == URX_STRING_LEN);
3883                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3884 
3885 
3886                     UChar32   cPattern;
3887                     UChar32   cText;
3888                     UBool     success = TRUE;
3889 
3890                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3891                     CaseFoldingUTextIterator inputIterator(*fInputText);
3892                     while (patternStringIdx < patternStringLen) {
3893                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3894                             success = FALSE;
3895                             fHitEnd = TRUE;
3896                             break;
3897                         }
3898                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3899                         cText = inputIterator.next();
3900                         if (cText != cPattern) {
3901                             success = FALSE;
3902                             break;
3903                         }
3904                     }
3905                     if (inputIterator.inExpansion()) {
3906                         success = FALSE;
3907                     }
3908 
3909                     if (success) {
3910                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3911                     } else {
3912                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3913                     }
3914                 }
3915             }
3916             break;
3917 
3918         case URX_LB_START:
3919             {
3920                 // Entering a look-behind block.
3921                 // Save Stack Ptr, Input Pos.
3922                 //   TODO:  implement transparent bounds.  Ticket #6067
3923                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3924                 fData[opValue]   = fStack->size();
3925                 fData[opValue+1] = fp->fInputIdx;
3926                 // Init the variable containing the start index for attempted matches.
3927                 fData[opValue+2] = -1;
3928                 // Save input string length, then reset to pin any matches to end at
3929                 //   the current position.
3930                 fData[opValue+3] = fActiveLimit;
3931                 fActiveLimit     = fp->fInputIdx;
3932             }
3933             break;
3934 
3935 
3936         case URX_LB_CONT:
3937             {
3938                 // Positive Look-Behind, at top of loop checking for matches of LB expression
3939                 //    at all possible input starting positions.
3940 
3941                 // Fetch the min and max possible match lengths.  They are the operands
3942                 //   of this op in the pattern.
3943                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3944                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3945                 if (!UTEXT_USES_U16(fInputText)) {
3946                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3947                     // The max length need not be exact; it just needs to be >= actual maximum.
3948                     maxML *= 3;
3949                 }
3950                 U_ASSERT(minML <= maxML);
3951                 U_ASSERT(minML >= 0);
3952 
3953                 // Fetch (from data) the last input index where a match was attempted.
3954                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3955                 int64_t  &lbStartIdx = fData[opValue+2];
3956                 if (lbStartIdx < 0) {
3957                     // First time through loop.
3958                     lbStartIdx = fp->fInputIdx - minML;
3959                     if (lbStartIdx > 0) {
3960                         // move index to a code point boudary, if it's not on one already.
3961                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3962                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3963                     }
3964                 } else {
3965                     // 2nd through nth time through the loop.
3966                     // Back up start position for match by one.
3967                     if (lbStartIdx == 0) {
3968                         (lbStartIdx)--;
3969                     } else {
3970                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3971                         (void)UTEXT_PREVIOUS32(fInputText);
3972                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3973                     }
3974                 }
3975 
3976                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3977                     // We have tried all potential match starting points without
3978                     //  getting a match.  Backtrack out, and out of the
3979                     //   Look Behind altogether.
3980                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3981                     int64_t restoreInputLen = fData[opValue+3];
3982                     U_ASSERT(restoreInputLen >= fActiveLimit);
3983                     U_ASSERT(restoreInputLen <= fInputLength);
3984                     fActiveLimit = restoreInputLen;
3985                     break;
3986                 }
3987 
3988                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3989                 //      (successful match will fall off the end of the loop.)
3990                 fp = StateSave(fp, fp->fPatIdx-3, status);
3991                 fp->fInputIdx = lbStartIdx;
3992             }
3993             break;
3994 
3995         case URX_LB_END:
3996             // End of a look-behind block, after a successful match.
3997             {
3998                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3999                 if (fp->fInputIdx != fActiveLimit) {
4000                     //  The look-behind expression matched, but the match did not
4001                     //    extend all the way to the point that we are looking behind from.
4002                     //  FAIL out of here, which will take us back to the LB_CONT, which
4003                     //     will retry the match starting at another position or fail
4004                     //     the look-behind altogether, whichever is appropriate.
4005                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4006                     break;
4007                 }
4008 
4009                 // Look-behind match is good.  Restore the orignal input string length,
4010                 //   which had been truncated to pin the end of the lookbehind match to the
4011                 //   position being looked-behind.
4012                 int64_t originalInputLen = fData[opValue+3];
4013                 U_ASSERT(originalInputLen >= fActiveLimit);
4014                 U_ASSERT(originalInputLen <= fInputLength);
4015                 fActiveLimit = originalInputLen;
4016             }
4017             break;
4018 
4019 
4020         case URX_LBN_CONT:
4021             {
4022                 // Negative Look-Behind, at top of loop checking for matches of LB expression
4023                 //    at all possible input starting positions.
4024 
4025                 // Fetch the extra parameters of this op.
4026                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4027                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4028                 if (!UTEXT_USES_U16(fInputText)) {
4029                     // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4030                     // The max length need not be exact; it just needs to be >= actual maximum.
4031                     maxML *= 3;
4032                 }
4033                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4034                         continueLoc = URX_VAL(continueLoc);
4035                 U_ASSERT(minML <= maxML);
4036                 U_ASSERT(minML >= 0);
4037                 U_ASSERT(continueLoc > fp->fPatIdx);
4038 
4039                 // Fetch (from data) the last input index where a match was attempted.
4040                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4041                 int64_t  &lbStartIdx = fData[opValue+2];
4042                 if (lbStartIdx < 0) {
4043                     // First time through loop.
4044                     lbStartIdx = fp->fInputIdx - minML;
4045                     if (lbStartIdx > 0) {
4046                         // move index to a code point boudary, if it's not on one already.
4047                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4048                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4049                     }
4050                 } else {
4051                     // 2nd through nth time through the loop.
4052                     // Back up start position for match by one.
4053                     if (lbStartIdx == 0) {
4054                         (lbStartIdx)--;
4055                     } else {
4056                         UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4057                         (void)UTEXT_PREVIOUS32(fInputText);
4058                         lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4059                     }
4060                 }
4061 
4062                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4063                     // We have tried all potential match starting points without
4064                     //  getting a match, which means that the negative lookbehind as
4065                     //  a whole has succeeded.  Jump forward to the continue location
4066                     int64_t restoreInputLen = fData[opValue+3];
4067                     U_ASSERT(restoreInputLen >= fActiveLimit);
4068                     U_ASSERT(restoreInputLen <= fInputLength);
4069                     fActiveLimit = restoreInputLen;
4070                     fp->fPatIdx = continueLoc;
4071                     break;
4072                 }
4073 
4074                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4075                 //      (successful match will cause a FAIL out of the loop altogether.)
4076                 fp = StateSave(fp, fp->fPatIdx-4, status);
4077                 fp->fInputIdx = lbStartIdx;
4078             }
4079             break;
4080 
4081         case URX_LBN_END:
4082             // End of a negative look-behind block, after a successful match.
4083             {
4084                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4085                 if (fp->fInputIdx != fActiveLimit) {
4086                     //  The look-behind expression matched, but the match did not
4087                     //    extend all the way to the point that we are looking behind from.
4088                     //  FAIL out of here, which will take us back to the LB_CONT, which
4089                     //     will retry the match starting at another position or succeed
4090                     //     the look-behind altogether, whichever is appropriate.
4091                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4092                     break;
4093                 }
4094 
4095                 // Look-behind expression matched, which means look-behind test as
4096                 //   a whole Fails
4097 
4098                 //   Restore the orignal input string length, which had been truncated
4099                 //   inorder to pin the end of the lookbehind match
4100                 //   to the position being looked-behind.
4101                 int64_t originalInputLen = fData[opValue+3];
4102                 U_ASSERT(originalInputLen >= fActiveLimit);
4103                 U_ASSERT(originalInputLen <= fInputLength);
4104                 fActiveLimit = originalInputLen;
4105 
4106                 // Restore original stack position, discarding any state saved
4107                 //   by the successful pattern match.
4108                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4109                 int32_t newStackSize = (int32_t)fData[opValue];
4110                 U_ASSERT(fStack->size() > newStackSize);
4111                 fStack->setSize(newStackSize);
4112 
4113                 //  FAIL, which will take control back to someplace
4114                 //  prior to entering the look-behind test.
4115                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4116             }
4117             break;
4118 
4119 
4120         case URX_LOOP_SR_I:
4121             // Loop Initialization for the optimized implementation of
4122             //     [some character set]*
4123             //   This op scans through all matching input.
4124             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4125             {
4126                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4127                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4128                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
4129 
4130                 // Loop through input, until either the input is exhausted or
4131                 //   we reach a character that is not a member of the set.
4132                 int64_t ix = fp->fInputIdx;
4133                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4134                 for (;;) {
4135                     if (ix >= fActiveLimit) {
4136                         fHitEnd = TRUE;
4137                         break;
4138                     }
4139                     UChar32 c = UTEXT_NEXT32(fInputText);
4140                     if (c<256) {
4141                         if (s8->contains(c) == FALSE) {
4142                             break;
4143                         }
4144                     } else {
4145                         if (s->contains(c) == FALSE) {
4146                             break;
4147                         }
4148                     }
4149                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4150                 }
4151 
4152                 // If there were no matching characters, skip over the loop altogether.
4153                 //   The loop doesn't run at all, a * op always succeeds.
4154                 if (ix == fp->fInputIdx) {
4155                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4156                     break;
4157                 }
4158 
4159                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4160                 //   must follow.  It's operand is the stack location
4161                 //   that holds the starting input index for the match of this [set]*
4162                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4163                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4164                 int32_t stackLoc = URX_VAL(loopcOp);
4165                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4166                 fp->fExtra[stackLoc] = fp->fInputIdx;
4167                 fp->fInputIdx = ix;
4168 
4169                 // Save State to the URX_LOOP_C op that follows this one,
4170                 //   so that match failures in the following code will return to there.
4171                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4172                 fp = StateSave(fp, fp->fPatIdx, status);
4173                 fp->fPatIdx++;
4174             }
4175             break;
4176 
4177 
4178         case URX_LOOP_DOT_I:
4179             // Loop Initialization for the optimized implementation of .*
4180             //   This op scans through all remaining input.
4181             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4182             {
4183                 // Loop through input until the input is exhausted (we reach an end-of-line)
4184                 // In DOTALL mode, we can just go straight to the end of the input.
4185                 int64_t ix;
4186                 if ((opValue & 1) == 1) {
4187                     // Dot-matches-All mode.  Jump straight to the end of the string.
4188                     ix = fActiveLimit;
4189                     fHitEnd = TRUE;
4190                 } else {
4191                     // NOT DOT ALL mode.  Line endings do not match '.'
4192                     // Scan forward until a line ending or end of input.
4193                     ix = fp->fInputIdx;
4194                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4195                     for (;;) {
4196                         if (ix >= fActiveLimit) {
4197                             fHitEnd = TRUE;
4198                             break;
4199                         }
4200                         UChar32 c = UTEXT_NEXT32(fInputText);
4201                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4202                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4203                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4204                                     isLineTerminator(c))) {
4205                                 //  char is a line ending.  Exit the scanning loop.
4206                                 break;
4207                             }
4208                         }
4209                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4210                     }
4211                 }
4212 
4213                 // If there were no matching characters, skip over the loop altogether.
4214                 //   The loop doesn't run at all, a * op always succeeds.
4215                 if (ix == fp->fInputIdx) {
4216                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4217                     break;
4218                 }
4219 
4220                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4221                 //   must follow.  It's operand is the stack location
4222                 //   that holds the starting input index for the match of this .*
4223                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4224                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4225                 int32_t stackLoc = URX_VAL(loopcOp);
4226                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4227                 fp->fExtra[stackLoc] = fp->fInputIdx;
4228                 fp->fInputIdx = ix;
4229 
4230                 // Save State to the URX_LOOP_C op that follows this one,
4231                 //   so that match failures in the following code will return to there.
4232                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4233                 fp = StateSave(fp, fp->fPatIdx, status);
4234                 fp->fPatIdx++;
4235             }
4236             break;
4237 
4238 
4239         case URX_LOOP_C:
4240             {
4241                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4242                 backSearchIndex = fp->fExtra[opValue];
4243                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4244                 if (backSearchIndex == fp->fInputIdx) {
4245                     // We've backed up the input idx to the point that the loop started.
4246                     // The loop is done.  Leave here without saving state.
4247                     //  Subsequent failures won't come back here.
4248                     break;
4249                 }
4250                 // Set up for the next iteration of the loop, with input index
4251                 //   backed up by one from the last time through,
4252                 //   and a state save to this instruction in case the following code fails again.
4253                 //   (We're going backwards because this loop emulates stack unwinding, not
4254                 //    the initial scan forward.)
4255                 U_ASSERT(fp->fInputIdx > 0);
4256                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4257                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4258                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4259 
4260                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4261                 if (prevC == 0x0a &&
4262                     fp->fInputIdx > backSearchIndex &&
4263                     twoPrevC == 0x0d) {
4264                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4265                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4266                         // .*, stepping back over CRLF pair.
4267                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4268                     }
4269                 }
4270 
4271 
4272                 fp = StateSave(fp, fp->fPatIdx-1, status);
4273             }
4274             break;
4275 
4276 
4277 
4278         default:
4279             // Trouble.  The compiled pattern contains an entry with an
4280             //           unrecognized type tag.
4281             U_ASSERT(FALSE);
4282         }
4283 
4284         if (U_FAILURE(status)) {
4285             isMatch = FALSE;
4286             break;
4287         }
4288     }
4289 
4290 breakFromLoop:
4291     fMatch = isMatch;
4292     if (isMatch) {
4293         fLastMatchEnd = fMatchEnd;
4294         fMatchStart   = startIdx;
4295         fMatchEnd     = fp->fInputIdx;
4296     }
4297 
4298 #ifdef REGEX_RUN_DEBUG
4299     if (fTraceDebug) {
4300         if (isMatch) {
4301             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4302         } else {
4303             printf("No match\n\n");
4304         }
4305     }
4306 #endif
4307 
4308     fFrame = fp;                // The active stack frame when the engine stopped.
4309                                 //   Contains the capture group results that we need to
4310                                 //    access later.
4311     return;
4312 }
4313 
4314 
4315 //--------------------------------------------------------------------------------
4316 //
4317 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4318 //                  assumption that the entire string is available in the UText's
4319 //                  chunk buffer. For now, that means we can use int32_t indexes,
4320 //                  except for anything that needs to be saved (like group starts
4321 //                  and ends).
4322 //
4323 //                  startIdx:    begin matching a this index.
4324 //                  toEnd:       if true, match must extend to end of the input region
4325 //
4326 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4327 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4328     UBool       isMatch  = FALSE;      // True if the we have a match.
4329 
4330     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4331 
4332     int32_t     op;                    // Operation from the compiled pattern, split into
4333     int32_t     opType;                //    the opcode
4334     int32_t     opValue;               //    and the operand value.
4335 
4336 #ifdef REGEX_RUN_DEBUG
4337     if (fTraceDebug) {
4338         printf("MatchAt(startIdx=%d)\n", startIdx);
4339         printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4340         printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4341     }
4342 #endif
4343 
4344     if (U_FAILURE(status)) {
4345         return;
4346     }
4347 
4348     //  Cache frequently referenced items from the compiled pattern
4349     //
4350     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4351 
4352     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4353     UVector             *fSets         = fPattern->fSets;
4354 
4355     const UChar         *inputBuf      = fInputText->chunkContents;
4356 
4357     fFrameSize = fPattern->fFrameSize;
4358     REStackFrame        *fp            = resetStack();
4359     if (U_FAILURE(fDeferredStatus)) {
4360         status = fDeferredStatus;
4361         return;
4362     }
4363 
4364     fp->fPatIdx   = 0;
4365     fp->fInputIdx = startIdx;
4366 
4367     // Zero out the pattern's static data
4368     int32_t i;
4369     for (i = 0; i<fPattern->fDataSize; i++) {
4370         fData[i] = 0;
4371     }
4372 
4373     //
4374     //  Main loop for interpreting the compiled pattern.
4375     //  One iteration of the loop per pattern operation performed.
4376     //
4377     for (;;) {
4378         op      = (int32_t)pat[fp->fPatIdx];
4379         opType  = URX_TYPE(op);
4380         opValue = URX_VAL(op);
4381 #ifdef REGEX_RUN_DEBUG
4382         if (fTraceDebug) {
4383             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4384             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4385                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4386             fPattern->dumpOp(fp->fPatIdx);
4387         }
4388 #endif
4389         fp->fPatIdx++;
4390 
4391         switch (opType) {
4392 
4393 
4394         case URX_NOP:
4395             break;
4396 
4397 
4398         case URX_BACKTRACK:
4399             // Force a backtrack.  In some circumstances, the pattern compiler
4400             //   will notice that the pattern can't possibly match anything, and will
4401             //   emit one of these at that point.
4402             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4403             break;
4404 
4405 
4406         case URX_ONECHAR:
4407             if (fp->fInputIdx < fActiveLimit) {
4408                 UChar32 c;
4409                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4410                 if (c == opValue) {
4411                     break;
4412                 }
4413             } else {
4414                 fHitEnd = TRUE;
4415             }
4416             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4417             break;
4418 
4419 
4420         case URX_STRING:
4421             {
4422                 // Test input against a literal string.
4423                 // Strings require two slots in the compiled pattern, one for the
4424                 //   offset to the string text, and one for the length.
4425                 int32_t   stringStartIdx = opValue;
4426                 int32_t   stringLen;
4427 
4428                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4429                 fp->fPatIdx++;
4430                 opType    = URX_TYPE(op);
4431                 stringLen = URX_VAL(op);
4432                 U_ASSERT(opType == URX_STRING_LEN);
4433                 U_ASSERT(stringLen >= 2);
4434 
4435                 const UChar * pInp = inputBuf + fp->fInputIdx;
4436                 const UChar * pInpLimit = inputBuf + fActiveLimit;
4437                 const UChar * pPat = litText+stringStartIdx;
4438                 const UChar * pEnd = pInp + stringLen;
4439                 UBool success = TRUE;
4440                 while (pInp < pEnd) {
4441                     if (pInp >= pInpLimit) {
4442                         fHitEnd = TRUE;
4443                         success = FALSE;
4444                         break;
4445                     }
4446                     if (*pInp++ != *pPat++) {
4447                         success = FALSE;
4448                         break;
4449                     }
4450                 }
4451 
4452                 if (success) {
4453                     fp->fInputIdx += stringLen;
4454                 } else {
4455                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4456                 }
4457             }
4458             break;
4459 
4460 
4461         case URX_STATE_SAVE:
4462             fp = StateSave(fp, opValue, status);
4463             break;
4464 
4465 
4466         case URX_END:
4467             // The match loop will exit via this path on a successful match,
4468             //   when we reach the end of the pattern.
4469             if (toEnd && fp->fInputIdx != fActiveLimit) {
4470                 // The pattern matched, but not to the end of input.  Try some more.
4471                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4472                 break;
4473             }
4474             isMatch = TRUE;
4475             goto  breakFromLoop;
4476 
4477             // Start and End Capture stack frame variables are laid out out like this:
4478             //  fp->fExtra[opValue]  - The start of a completed capture group
4479             //             opValue+1 - The end   of a completed capture group
4480             //             opValue+2 - the start of a capture group whose end
4481             //                          has not yet been reached (and might not ever be).
4482         case URX_START_CAPTURE:
4483             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4484             fp->fExtra[opValue+2] = fp->fInputIdx;
4485             break;
4486 
4487 
4488         case URX_END_CAPTURE:
4489             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4490             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4491             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4492             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4493             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4494             break;
4495 
4496 
4497         case URX_DOLLAR:                   //  $, test for End of line
4498             //     or for position before new line at end of input
4499             if (fp->fInputIdx < fAnchorLimit-2) {
4500                 // We are no where near the end of input.  Fail.
4501                 //   This is the common case.  Keep it first.
4502                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4503                 break;
4504             }
4505             if (fp->fInputIdx >= fAnchorLimit) {
4506                 // We really are at the end of input.  Success.
4507                 fHitEnd = TRUE;
4508                 fRequireEnd = TRUE;
4509                 break;
4510             }
4511 
4512             // If we are positioned just before a new-line that is located at the
4513             //   end of input, succeed.
4514             if (fp->fInputIdx == fAnchorLimit-1) {
4515                 UChar32 c;
4516                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4517 
4518                 if (isLineTerminator(c)) {
4519                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4520                         // At new-line at end of input. Success
4521                         fHitEnd = TRUE;
4522                         fRequireEnd = TRUE;
4523                         break;
4524                     }
4525                 }
4526             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4527                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4528                     fHitEnd = TRUE;
4529                     fRequireEnd = TRUE;
4530                     break;                         // At CR/LF at end of input.  Success
4531             }
4532 
4533             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4534 
4535             break;
4536 
4537 
4538         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4539             if (fp->fInputIdx >= fAnchorLimit-1) {
4540                 // Either at the last character of input, or off the end.
4541                 if (fp->fInputIdx == fAnchorLimit-1) {
4542                     // At last char of input.  Success if it's a new line.
4543                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4544                         fHitEnd = TRUE;
4545                         fRequireEnd = TRUE;
4546                         break;
4547                     }
4548                 } else {
4549                     // Off the end of input.  Success.
4550                     fHitEnd = TRUE;
4551                     fRequireEnd = TRUE;
4552                     break;
4553                 }
4554             }
4555 
4556             // Not at end of input.  Back-track out.
4557             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4558             break;
4559 
4560 
4561         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4562             {
4563                 if (fp->fInputIdx >= fAnchorLimit) {
4564                     // We really are at the end of input.  Success.
4565                     fHitEnd = TRUE;
4566                     fRequireEnd = TRUE;
4567                     break;
4568                 }
4569                 // If we are positioned just before a new-line, succeed.
4570                 // It makes no difference where the new-line is within the input.
4571                 UChar32 c = inputBuf[fp->fInputIdx];
4572                 if (isLineTerminator(c)) {
4573                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4574                     //  In multi-line mode, hitting a new-line just before the end of input does not
4575                     //   set the hitEnd or requireEnd flags
4576                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4577                         break;
4578                     }
4579                 }
4580                 // not at a new line.  Fail.
4581                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4582             }
4583             break;
4584 
4585 
4586         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4587             {
4588                 if (fp->fInputIdx >= fAnchorLimit) {
4589                     // We really are at the end of input.  Success.
4590                     fHitEnd = TRUE;
4591                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4592                     break;               //   adding a new-line would not lose the match.
4593                 }
4594                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4595                 // It makes no difference where the new-line is within the input.
4596                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4597                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4598                 }
4599             }
4600             break;
4601 
4602 
4603         case URX_CARET:                    //  ^, test for start of line
4604             if (fp->fInputIdx != fAnchorStart) {
4605                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4606             }
4607             break;
4608 
4609 
4610         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4611             {
4612                 if (fp->fInputIdx == fAnchorStart) {
4613                     // We are at the start input.  Success.
4614                     break;
4615                 }
4616                 // Check whether character just before the current pos is a new-line
4617                 //   unless we are at the end of input
4618                 UChar  c = inputBuf[fp->fInputIdx - 1];
4619                 if ((fp->fInputIdx < fAnchorLimit) &&
4620                     isLineTerminator(c)) {
4621                     //  It's a new-line.  ^ is true.  Success.
4622                     //  TODO:  what should be done with positions between a CR and LF?
4623                     break;
4624                 }
4625                 // Not at the start of a line.  Fail.
4626                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4627             }
4628             break;
4629 
4630 
4631         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4632             {
4633                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4634                 if (fp->fInputIdx <= fAnchorStart) {
4635                     // We are at the start input.  Success.
4636                     break;
4637                 }
4638                 // Check whether character just before the current pos is a new-line
4639                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4640                 UChar  c = inputBuf[fp->fInputIdx - 1];
4641                 if (c != 0x0a) {
4642                     // Not at the start of a line.  Back-track out.
4643                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4644                 }
4645             }
4646             break;
4647 
4648         case URX_BACKSLASH_B:          // Test for word boundaries
4649             {
4650                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4651                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4652                 if (!success) {
4653                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4654                 }
4655             }
4656             break;
4657 
4658 
4659         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4660             {
4661                 UBool success = isUWordBoundary(fp->fInputIdx);
4662                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4663                 if (!success) {
4664                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4665                 }
4666             }
4667             break;
4668 
4669 
4670         case URX_BACKSLASH_D:            // Test for decimal digit
4671             {
4672                 if (fp->fInputIdx >= fActiveLimit) {
4673                     fHitEnd = TRUE;
4674                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4675                     break;
4676                 }
4677 
4678                 UChar32 c;
4679                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4680                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4681                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4682                 success ^= (UBool)(opValue != 0);        // flip sense for \D
4683                 if (!success) {
4684                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4685                 }
4686             }
4687             break;
4688 
4689 
4690         case URX_BACKSLASH_G:          // Test for position at end of previous match
4691             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4692                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4693             }
4694             break;
4695 
4696 
4697         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
4698             {
4699                 if (fp->fInputIdx >= fActiveLimit) {
4700                     fHitEnd = TRUE;
4701                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4702                     break;
4703                 }
4704                 UChar32 c;
4705                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4706                 int8_t ctype = u_charType(c);
4707                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
4708                 success ^= (UBool)(opValue != 0);        // flip sense for \H
4709                 if (!success) {
4710                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4711                 }
4712             }
4713             break;
4714 
4715 
4716         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
4717             {
4718                 if (fp->fInputIdx >= fActiveLimit) {
4719                     fHitEnd = TRUE;
4720                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4721                     break;
4722                 }
4723                 UChar32 c;
4724                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4725                 if (isLineTerminator(c)) {
4726                     if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4727                         // Check for CR/LF sequence. Consume both together when found.
4728                         UChar c2;
4729                         U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4730                         if (c2 != 0x0a) {
4731                             U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4732                         }
4733                     }
4734                 } else {
4735                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4736                 }
4737             }
4738             break;
4739 
4740 
4741         case URX_BACKSLASH_V:         // Any single code point line ending.
4742             {
4743                 if (fp->fInputIdx >= fActiveLimit) {
4744                     fHitEnd = TRUE;
4745                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4746                     break;
4747                 }
4748                 UChar32 c;
4749                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4750                 UBool success = isLineTerminator(c);
4751                 success ^= (UBool)(opValue != 0);        // flip sense for \V
4752                 if (!success) {
4753                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4754                 }
4755             }
4756             break;
4757 
4758 
4759 
4760         case URX_BACKSLASH_X:
4761         //  Match a Grapheme, as defined by Unicode TR 29.
4762         //  Differs slightly from Perl, which consumes combining marks independently
4763         //    of context.
4764         {
4765 
4766             // Fail if at end of input
4767             if (fp->fInputIdx >= fActiveLimit) {
4768                 fHitEnd = TRUE;
4769                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4770                 break;
4771             }
4772 
4773             // Examine (and consume) the current char.
4774             //   Dispatch into a little state machine, based on the char.
4775             UChar32  c;
4776             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4777             UnicodeSet **sets = fPattern->fStaticSets;
4778             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4779             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4780             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4781             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4782             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4783             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4784             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4785             goto GC_Extend;
4786 
4787 
4788 
4789 GC_L:
4790             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4791             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4792             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4793             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4794             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4795             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4796             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4797             goto GC_Extend;
4798 
4799 GC_V:
4800             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4801             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4802             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4803             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4804             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4805             goto GC_Extend;
4806 
4807 GC_T:
4808             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4809             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4810             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4811             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4812             goto GC_Extend;
4813 
4814 GC_Extend:
4815             // Combining characters are consumed here
4816             for (;;) {
4817                 if (fp->fInputIdx >= fActiveLimit) {
4818                     break;
4819                 }
4820                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4821                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4822                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4823                     break;
4824                 }
4825             }
4826             goto GC_Done;
4827 
4828 GC_Control:
4829             // Most control chars stand alone (don't combine with combining chars),
4830             //   except for that CR/LF sequence is a single grapheme cluster.
4831             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4832                 fp->fInputIdx++;
4833             }
4834 
4835 GC_Done:
4836             if (fp->fInputIdx >= fActiveLimit) {
4837                 fHitEnd = TRUE;
4838             }
4839             break;
4840         }
4841 
4842 
4843 
4844 
4845         case URX_BACKSLASH_Z:          // Test for end of Input
4846             if (fp->fInputIdx < fAnchorLimit) {
4847                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4848             } else {
4849                 fHitEnd = TRUE;
4850                 fRequireEnd = TRUE;
4851             }
4852             break;
4853 
4854 
4855 
4856         case URX_STATIC_SETREF:
4857             {
4858                 // Test input character against one of the predefined sets
4859                 //    (Word Characters, for example)
4860                 // The high bit of the op value is a flag for the match polarity.
4861                 //    0:   success if input char is in set.
4862                 //    1:   success if input char is not in set.
4863                 if (fp->fInputIdx >= fActiveLimit) {
4864                     fHitEnd = TRUE;
4865                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4866                     break;
4867                 }
4868 
4869                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4870                 opValue &= ~URX_NEG_SET;
4871                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4872 
4873                 UChar32 c;
4874                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4875                 if (c < 256) {
4876                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4877                     if (s8->contains(c)) {
4878                         success = !success;
4879                     }
4880                 } else {
4881                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
4882                     if (s->contains(c)) {
4883                         success = !success;
4884                     }
4885                 }
4886                 if (!success) {
4887                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888                 }
4889             }
4890             break;
4891 
4892 
4893         case URX_STAT_SETREF_N:
4894             {
4895                 // Test input character for NOT being a member of  one of
4896                 //    the predefined sets (Word Characters, for example)
4897                 if (fp->fInputIdx >= fActiveLimit) {
4898                     fHitEnd = TRUE;
4899                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4900                     break;
4901                 }
4902 
4903                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4904 
4905                 UChar32  c;
4906                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4907                 if (c < 256) {
4908                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4909                     if (s8->contains(c) == FALSE) {
4910                         break;
4911                     }
4912                 } else {
4913                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
4914                     if (s->contains(c) == FALSE) {
4915                         break;
4916                     }
4917                 }
4918                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4919             }
4920             break;
4921 
4922 
4923         case URX_SETREF:
4924             {
4925                 if (fp->fInputIdx >= fActiveLimit) {
4926                     fHitEnd = TRUE;
4927                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4928                     break;
4929                 }
4930 
4931                 U_ASSERT(opValue > 0 && opValue < fSets->size());
4932 
4933                 // There is input left.  Pick up one char and test it for set membership.
4934                 UChar32  c;
4935                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4936                 if (c<256) {
4937                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4938                     if (s8->contains(c)) {
4939                         // The character is in the set.  A Match.
4940                         break;
4941                     }
4942                 } else {
4943                     UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4944                     if (s->contains(c)) {
4945                         // The character is in the set.  A Match.
4946                         break;
4947                     }
4948                 }
4949 
4950                 // the character wasn't in the set.
4951                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4952             }
4953             break;
4954 
4955 
4956         case URX_DOTANY:
4957             {
4958                 // . matches anything, but stops at end-of-line.
4959                 if (fp->fInputIdx >= fActiveLimit) {
4960                     // At end of input.  Match failed.  Backtrack out.
4961                     fHitEnd = TRUE;
4962                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4963                     break;
4964                 }
4965 
4966                 // There is input left.  Advance over one char, unless we've hit end-of-line
4967                 UChar32  c;
4968                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4969                 if (isLineTerminator(c)) {
4970                     // End of line in normal mode.   . does not match.
4971                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4972                     break;
4973                 }
4974             }
4975             break;
4976 
4977 
4978         case URX_DOTANY_ALL:
4979             {
4980                 // . in dot-matches-all (including new lines) mode
4981                 if (fp->fInputIdx >= fActiveLimit) {
4982                     // At end of input.  Match failed.  Backtrack out.
4983                     fHitEnd = TRUE;
4984                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4985                     break;
4986                 }
4987 
4988                 // There is input left.  Advance over one char, except if we are
4989                 //   at a cr/lf, advance over both of them.
4990                 UChar32 c;
4991                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4992                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4993                     // In the case of a CR/LF, we need to advance over both.
4994                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4995                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4996                     }
4997                 }
4998             }
4999             break;
5000 
5001 
5002         case URX_DOTANY_UNIX:
5003             {
5004                 // '.' operator, matches all, but stops at end-of-line.
5005                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
5006                 if (fp->fInputIdx >= fActiveLimit) {
5007                     // At end of input.  Match failed.  Backtrack out.
5008                     fHitEnd = TRUE;
5009                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5010                     break;
5011                 }
5012 
5013                 // There is input left.  Advance over one char, unless we've hit end-of-line
5014                 UChar32 c;
5015                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5016                 if (c == 0x0a) {
5017                     // End of line in normal mode.   '.' does not match the \n
5018                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5019                 }
5020             }
5021             break;
5022 
5023 
5024         case URX_JMP:
5025             fp->fPatIdx = opValue;
5026             break;
5027 
5028         case URX_FAIL:
5029             isMatch = FALSE;
5030             goto breakFromLoop;
5031 
5032         case URX_JMP_SAV:
5033             U_ASSERT(opValue < fPattern->fCompiledPat->size());
5034             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5035             fp->fPatIdx = opValue;                         // Then JMP.
5036             break;
5037 
5038         case URX_JMP_SAV_X:
5039             // This opcode is used with (x)+, when x can match a zero length string.
5040             // Same as JMP_SAV, except conditional on the match having made forward progress.
5041             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5042             //   data address of the input position at the start of the loop.
5043             {
5044                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5045                 int32_t  stoOp = (int32_t)pat[opValue-1];
5046                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5047                 int32_t  frameLoc = URX_VAL(stoOp);
5048                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5049                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5050                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5051                 if (prevInputIdx < fp->fInputIdx) {
5052                     // The match did make progress.  Repeat the loop.
5053                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5054                     fp->fPatIdx = opValue;
5055                     fp->fExtra[frameLoc] = fp->fInputIdx;
5056                 }
5057                 // If the input position did not advance, we do nothing here,
5058                 //   execution will fall out of the loop.
5059             }
5060             break;
5061 
5062         case URX_CTR_INIT:
5063             {
5064                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5065                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5066 
5067                 // Pick up the three extra operands that CTR_INIT has, and
5068                 //    skip the pattern location counter past
5069                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5070                 fp->fPatIdx += 3;
5071                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5072                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5073                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5074                 U_ASSERT(minCount>=0);
5075                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5076                 U_ASSERT(loopLoc>=fp->fPatIdx);
5077 
5078                 if (minCount == 0) {
5079                     fp = StateSave(fp, loopLoc+1, status);
5080                 }
5081                 if (maxCount == -1) {
5082                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
5083                 } else if (maxCount == 0) {
5084                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5085                 }
5086             }
5087             break;
5088 
5089         case URX_CTR_LOOP:
5090             {
5091                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5092                 int32_t initOp = (int32_t)pat[opValue];
5093                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5094                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5095                 int32_t minCount  = (int32_t)pat[opValue+2];
5096                 int32_t maxCount  = (int32_t)pat[opValue+3];
5097                 (*pCounter)++;
5098                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5099                     U_ASSERT(*pCounter == maxCount);
5100                     break;
5101                 }
5102                 if (*pCounter >= minCount) {
5103                     if (maxCount == -1) {
5104                         // Loop has no hard upper bound.
5105                         // Check that it is progressing through the input, break if it is not.
5106                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5107                         if (fp->fInputIdx == *pLastInputIdx) {
5108                             break;
5109                         } else {
5110                             *pLastInputIdx = fp->fInputIdx;
5111                         }
5112                     }
5113                     fp = StateSave(fp, fp->fPatIdx, status);
5114                 } else {
5115                     // Increment time-out counter. (StateSave() does it if count >= minCount)
5116                     fTickCounter--;
5117                     if (fTickCounter <= 0) {
5118                         IncrementTime(status);    // Re-initializes fTickCounter
5119                     }
5120                 }
5121                 fp->fPatIdx = opValue + 4;    // Loop back.
5122             }
5123             break;
5124 
5125         case URX_CTR_INIT_NG:
5126             {
5127                 // Initialize a non-greedy loop
5128                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5129                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5130 
5131                 // Pick up the three extra operands that CTR_INIT_NG has, and
5132                 //    skip the pattern location counter past
5133                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5134                 fp->fPatIdx += 3;
5135                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5136                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5137                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5138                 U_ASSERT(minCount>=0);
5139                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5140                 U_ASSERT(loopLoc>fp->fPatIdx);
5141                 if (maxCount == -1) {
5142                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
5143                 }
5144 
5145                 if (minCount == 0) {
5146                     if (maxCount != 0) {
5147                         fp = StateSave(fp, fp->fPatIdx, status);
5148                     }
5149                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5150                 }
5151             }
5152             break;
5153 
5154         case URX_CTR_LOOP_NG:
5155             {
5156                 // Non-greedy {min, max} loops
5157                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5158                 int32_t initOp = (int32_t)pat[opValue];
5159                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5160                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5161                 int32_t minCount  = (int32_t)pat[opValue+2];
5162                 int32_t maxCount  = (int32_t)pat[opValue+3];
5163 
5164                 (*pCounter)++;
5165                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5166                     // The loop has matched the maximum permitted number of times.
5167                     //   Break out of here with no action.  Matching will
5168                     //   continue with the following pattern.
5169                     U_ASSERT(*pCounter == maxCount);
5170                     break;
5171                 }
5172 
5173                 if (*pCounter < minCount) {
5174                     // We haven't met the minimum number of matches yet.
5175                     //   Loop back for another one.
5176                     fp->fPatIdx = opValue + 4;    // Loop back.
5177                     fTickCounter--;
5178                     if (fTickCounter <= 0) {
5179                         IncrementTime(status);    // Re-initializes fTickCounter
5180                     }
5181                 } else {
5182                     // We do have the minimum number of matches.
5183 
5184                     // If there is no upper bound on the loop iterations, check that the input index
5185                     // is progressing, and stop the loop if it is not.
5186                     if (maxCount == -1) {
5187                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5188                         if (fp->fInputIdx == *pLastInputIdx) {
5189                             break;
5190                         }
5191                         *pLastInputIdx = fp->fInputIdx;
5192                     }
5193 
5194                     // Loop Continuation: we will fall into the pattern following the loop
5195                     //   (non-greedy, don't execute loop body first), but first do
5196                     //   a state save to the top of the loop, so that a match failure
5197                     //   in the following pattern will try another iteration of the loop.
5198                     fp = StateSave(fp, opValue + 4, status);
5199                 }
5200             }
5201             break;
5202 
5203         case URX_STO_SP:
5204             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5205             fData[opValue] = fStack->size();
5206             break;
5207 
5208         case URX_LD_SP:
5209             {
5210                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5211                 int32_t newStackSize = (int32_t)fData[opValue];
5212                 U_ASSERT(newStackSize <= fStack->size());
5213                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5214                 if (newFP == (int64_t *)fp) {
5215                     break;
5216                 }
5217                 int32_t j;
5218                 for (j=0; j<fFrameSize; j++) {
5219                     newFP[j] = ((int64_t *)fp)[j];
5220                 }
5221                 fp = (REStackFrame *)newFP;
5222                 fStack->setSize(newStackSize);
5223             }
5224             break;
5225 
5226         case URX_BACKREF:
5227             {
5228                 U_ASSERT(opValue < fFrameSize);
5229                 int64_t groupStartIdx = fp->fExtra[opValue];
5230                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5231                 U_ASSERT(groupStartIdx <= groupEndIdx);
5232                 int64_t inputIndex = fp->fInputIdx;
5233                 if (groupStartIdx < 0) {
5234                     // This capture group has not participated in the match thus far,
5235                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5236                     break;
5237                 }
5238                 UBool success = TRUE;
5239                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5240                     if (inputIndex >= fActiveLimit) {
5241                         success = FALSE;
5242                         fHitEnd = TRUE;
5243                         break;
5244                     }
5245                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5246                         success = FALSE;
5247                         break;
5248                     }
5249                 }
5250                 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5251                         inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5252                     // Capture group ended with an unpaired lead surrogate.
5253                     // Back reference is not permitted to match lead only of a surrogatge pair.
5254                     success = FALSE;
5255                 }
5256                 if (success) {
5257                     fp->fInputIdx = inputIndex;
5258                 } else {
5259                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5260                 }
5261             }
5262             break;
5263 
5264         case URX_BACKREF_I:
5265             {
5266                 U_ASSERT(opValue < fFrameSize);
5267                 int64_t groupStartIdx = fp->fExtra[opValue];
5268                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5269                 U_ASSERT(groupStartIdx <= groupEndIdx);
5270                 if (groupStartIdx < 0) {
5271                     // This capture group has not participated in the match thus far,
5272                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5273                     break;
5274                 }
5275                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5276                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5277 
5278                 //   Note: if the capture group match was of an empty string the backref
5279                 //         match succeeds.  Verified by testing:  Perl matches succeed
5280                 //         in this case, so we do too.
5281 
5282                 UBool success = TRUE;
5283                 for (;;) {
5284                     UChar32 captureGroupChar = captureGroupItr.next();
5285                     if (captureGroupChar == U_SENTINEL) {
5286                         success = TRUE;
5287                         break;
5288                     }
5289                     UChar32 inputChar = inputItr.next();
5290                     if (inputChar == U_SENTINEL) {
5291                         success = FALSE;
5292                         fHitEnd = TRUE;
5293                         break;
5294                     }
5295                     if (inputChar != captureGroupChar) {
5296                         success = FALSE;
5297                         break;
5298                     }
5299                 }
5300 
5301                 if (success && inputItr.inExpansion()) {
5302                     // We otained a match by consuming part of a string obtained from
5303                     // case-folding a single code point of the input text.
5304                     // This does not count as an overall match.
5305                     success = FALSE;
5306                 }
5307 
5308                 if (success) {
5309                     fp->fInputIdx = inputItr.getIndex();
5310                 } else {
5311                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5312                 }
5313             }
5314             break;
5315 
5316         case URX_STO_INP_LOC:
5317             {
5318                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5319                 fp->fExtra[opValue] = fp->fInputIdx;
5320             }
5321             break;
5322 
5323         case URX_JMPX:
5324             {
5325                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5326                 fp->fPatIdx += 1;
5327                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5328                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5329                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5330                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5331                 if (savedInputIdx < fp->fInputIdx) {
5332                     fp->fPatIdx = opValue;                               // JMP
5333                 } else {
5334                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5335                 }
5336             }
5337             break;
5338 
5339         case URX_LA_START:
5340             {
5341                 // Entering a lookahead block.
5342                 // Save Stack Ptr, Input Pos.
5343                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5344                 fData[opValue]   = fStack->size();
5345                 fData[opValue+1] = fp->fInputIdx;
5346                 fActiveStart     = fLookStart;          // Set the match region change for
5347                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5348             }
5349             break;
5350 
5351         case URX_LA_END:
5352             {
5353                 // Leaving a look-ahead block.
5354                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5355                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5356                 int32_t stackSize = fStack->size();
5357                 int32_t newStackSize = (int32_t)fData[opValue];
5358                 U_ASSERT(stackSize >= newStackSize);
5359                 if (stackSize > newStackSize) {
5360                     // Copy the current top frame back to the new (cut back) top frame.
5361                     //   This makes the capture groups from within the look-ahead
5362                     //   expression available.
5363                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5364                     int32_t j;
5365                     for (j=0; j<fFrameSize; j++) {
5366                         newFP[j] = ((int64_t *)fp)[j];
5367                     }
5368                     fp = (REStackFrame *)newFP;
5369                     fStack->setSize(newStackSize);
5370                 }
5371                 fp->fInputIdx = fData[opValue+1];
5372 
5373                 // Restore the active region bounds in the input string; they may have
5374                 //    been changed because of transparent bounds on a Region.
5375                 fActiveStart = fRegionStart;
5376                 fActiveLimit = fRegionLimit;
5377             }
5378             break;
5379 
5380         case URX_ONECHAR_I:
5381             if (fp->fInputIdx < fActiveLimit) {
5382                 UChar32 c;
5383                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5384                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5385                     break;
5386                 }
5387             } else {
5388                 fHitEnd = TRUE;
5389             }
5390             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5391             break;
5392 
5393         case URX_STRING_I:
5394             // Case-insensitive test input against a literal string.
5395             // Strings require two slots in the compiled pattern, one for the
5396             //   offset to the string text, and one for the length.
5397             //   The compiled string has already been case folded.
5398             {
5399                 const UChar *patternString = litText + opValue;
5400 
5401                 op      = (int32_t)pat[fp->fPatIdx];
5402                 fp->fPatIdx++;
5403                 opType  = URX_TYPE(op);
5404                 opValue = URX_VAL(op);
5405                 U_ASSERT(opType == URX_STRING_LEN);
5406                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5407 
5408                 UChar32      cText;
5409                 UChar32      cPattern;
5410                 UBool        success = TRUE;
5411                 int32_t      patternStringIdx  = 0;
5412                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5413                 while (patternStringIdx < patternStringLen) {
5414                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5415                     cText = inputIterator.next();
5416                     if (cText != cPattern) {
5417                         success = FALSE;
5418                         if (cText == U_SENTINEL) {
5419                             fHitEnd = TRUE;
5420                         }
5421                         break;
5422                     }
5423                 }
5424                 if (inputIterator.inExpansion()) {
5425                     success = FALSE;
5426                 }
5427 
5428                 if (success) {
5429                     fp->fInputIdx = inputIterator.getIndex();
5430                 } else {
5431                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5432                 }
5433             }
5434             break;
5435 
5436         case URX_LB_START:
5437             {
5438                 // Entering a look-behind block.
5439                 // Save Stack Ptr, Input Pos.
5440                 //   TODO:  implement transparent bounds.  Ticket #6067
5441                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5442                 fData[opValue]   = fStack->size();
5443                 fData[opValue+1] = fp->fInputIdx;
5444                 // Init the variable containing the start index for attempted matches.
5445                 fData[opValue+2] = -1;
5446                 // Save input string length, then reset to pin any matches to end at
5447                 //   the current position.
5448                 fData[opValue+3] = fActiveLimit;
5449                 fActiveLimit     = fp->fInputIdx;
5450             }
5451             break;
5452 
5453 
5454         case URX_LB_CONT:
5455             {
5456                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5457                 //    at all possible input starting positions.
5458 
5459                 // Fetch the min and max possible match lengths.  They are the operands
5460                 //   of this op in the pattern.
5461                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5462                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5463                 U_ASSERT(minML <= maxML);
5464                 U_ASSERT(minML >= 0);
5465 
5466                 // Fetch (from data) the last input index where a match was attempted.
5467                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5468                 int64_t  &lbStartIdx = fData[opValue+2];
5469                 if (lbStartIdx < 0) {
5470                     // First time through loop.
5471                     lbStartIdx = fp->fInputIdx - minML;
5472                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5473                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5474                     }
5475                 } else {
5476                     // 2nd through nth time through the loop.
5477                     // Back up start position for match by one.
5478                     if (lbStartIdx == 0) {
5479                         lbStartIdx--;
5480                     } else {
5481                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5482                     }
5483                 }
5484 
5485                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5486                     // We have tried all potential match starting points without
5487                     //  getting a match.  Backtrack out, and out of the
5488                     //   Look Behind altogether.
5489                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5490                     int64_t restoreInputLen = fData[opValue+3];
5491                     U_ASSERT(restoreInputLen >= fActiveLimit);
5492                     U_ASSERT(restoreInputLen <= fInputLength);
5493                     fActiveLimit = restoreInputLen;
5494                     break;
5495                 }
5496 
5497                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5498                 //      (successful match will fall off the end of the loop.)
5499                 fp = StateSave(fp, fp->fPatIdx-3, status);
5500                 fp->fInputIdx =  lbStartIdx;
5501             }
5502             break;
5503 
5504         case URX_LB_END:
5505             // End of a look-behind block, after a successful match.
5506             {
5507                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5508                 if (fp->fInputIdx != fActiveLimit) {
5509                     //  The look-behind expression matched, but the match did not
5510                     //    extend all the way to the point that we are looking behind from.
5511                     //  FAIL out of here, which will take us back to the LB_CONT, which
5512                     //     will retry the match starting at another position or fail
5513                     //     the look-behind altogether, whichever is appropriate.
5514                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5515                     break;
5516                 }
5517 
5518                 // Look-behind match is good.  Restore the orignal input string length,
5519                 //   which had been truncated to pin the end of the lookbehind match to the
5520                 //   position being looked-behind.
5521                 int64_t originalInputLen = fData[opValue+3];
5522                 U_ASSERT(originalInputLen >= fActiveLimit);
5523                 U_ASSERT(originalInputLen <= fInputLength);
5524                 fActiveLimit = originalInputLen;
5525             }
5526             break;
5527 
5528 
5529         case URX_LBN_CONT:
5530             {
5531                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5532                 //    at all possible input starting positions.
5533 
5534                 // Fetch the extra parameters of this op.
5535                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5536                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5537                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5538                 continueLoc = URX_VAL(continueLoc);
5539                 U_ASSERT(minML <= maxML);
5540                 U_ASSERT(minML >= 0);
5541                 U_ASSERT(continueLoc > fp->fPatIdx);
5542 
5543                 // Fetch (from data) the last input index where a match was attempted.
5544                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5545                 int64_t  &lbStartIdx = fData[opValue+2];
5546                 if (lbStartIdx < 0) {
5547                     // First time through loop.
5548                     lbStartIdx = fp->fInputIdx - minML;
5549                     if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5550                         U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5551                     }
5552                 } else {
5553                     // 2nd through nth time through the loop.
5554                     // Back up start position for match by one.
5555                     if (lbStartIdx == 0) {
5556                         lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
5557                     } else {
5558                         U16_BACK_1(inputBuf, 0, lbStartIdx);
5559                     }
5560                 }
5561 
5562                 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5563                     // We have tried all potential match starting points without
5564                     //  getting a match, which means that the negative lookbehind as
5565                     //  a whole has succeeded.  Jump forward to the continue location
5566                     int64_t restoreInputLen = fData[opValue+3];
5567                     U_ASSERT(restoreInputLen >= fActiveLimit);
5568                     U_ASSERT(restoreInputLen <= fInputLength);
5569                     fActiveLimit = restoreInputLen;
5570                     fp->fPatIdx = continueLoc;
5571                     break;
5572                 }
5573 
5574                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5575                 //      (successful match will cause a FAIL out of the loop altogether.)
5576                 fp = StateSave(fp, fp->fPatIdx-4, status);
5577                 fp->fInputIdx =  lbStartIdx;
5578             }
5579             break;
5580 
5581         case URX_LBN_END:
5582             // End of a negative look-behind block, after a successful match.
5583             {
5584                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5585                 if (fp->fInputIdx != fActiveLimit) {
5586                     //  The look-behind expression matched, but the match did not
5587                     //    extend all the way to the point that we are looking behind from.
5588                     //  FAIL out of here, which will take us back to the LB_CONT, which
5589                     //     will retry the match starting at another position or succeed
5590                     //     the look-behind altogether, whichever is appropriate.
5591                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5592                     break;
5593                 }
5594 
5595                 // Look-behind expression matched, which means look-behind test as
5596                 //   a whole Fails
5597 
5598                 //   Restore the orignal input string length, which had been truncated
5599                 //   inorder to pin the end of the lookbehind match
5600                 //   to the position being looked-behind.
5601                 int64_t originalInputLen = fData[opValue+3];
5602                 U_ASSERT(originalInputLen >= fActiveLimit);
5603                 U_ASSERT(originalInputLen <= fInputLength);
5604                 fActiveLimit = originalInputLen;
5605 
5606                 // Restore original stack position, discarding any state saved
5607                 //   by the successful pattern match.
5608                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5609                 int32_t newStackSize = (int32_t)fData[opValue];
5610                 U_ASSERT(fStack->size() > newStackSize);
5611                 fStack->setSize(newStackSize);
5612 
5613                 //  FAIL, which will take control back to someplace
5614                 //  prior to entering the look-behind test.
5615                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5616             }
5617             break;
5618 
5619 
5620         case URX_LOOP_SR_I:
5621             // Loop Initialization for the optimized implementation of
5622             //     [some character set]*
5623             //   This op scans through all matching input.
5624             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5625             {
5626                 U_ASSERT(opValue > 0 && opValue < fSets->size());
5627                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5628                 UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
5629 
5630                 // Loop through input, until either the input is exhausted or
5631                 //   we reach a character that is not a member of the set.
5632                 int32_t ix = (int32_t)fp->fInputIdx;
5633                 for (;;) {
5634                     if (ix >= fActiveLimit) {
5635                         fHitEnd = TRUE;
5636                         break;
5637                     }
5638                     UChar32   c;
5639                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5640                     if (c<256) {
5641                         if (s8->contains(c) == FALSE) {
5642                             U16_BACK_1(inputBuf, 0, ix);
5643                             break;
5644                         }
5645                     } else {
5646                         if (s->contains(c) == FALSE) {
5647                             U16_BACK_1(inputBuf, 0, ix);
5648                             break;
5649                         }
5650                     }
5651                 }
5652 
5653                 // If there were no matching characters, skip over the loop altogether.
5654                 //   The loop doesn't run at all, a * op always succeeds.
5655                 if (ix == fp->fInputIdx) {
5656                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5657                     break;
5658                 }
5659 
5660                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5661                 //   must follow.  It's operand is the stack location
5662                 //   that holds the starting input index for the match of this [set]*
5663                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5664                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5665                 int32_t stackLoc = URX_VAL(loopcOp);
5666                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5667                 fp->fExtra[stackLoc] = fp->fInputIdx;
5668                 fp->fInputIdx = ix;
5669 
5670                 // Save State to the URX_LOOP_C op that follows this one,
5671                 //   so that match failures in the following code will return to there.
5672                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5673                 fp = StateSave(fp, fp->fPatIdx, status);
5674                 fp->fPatIdx++;
5675             }
5676             break;
5677 
5678 
5679         case URX_LOOP_DOT_I:
5680             // Loop Initialization for the optimized implementation of .*
5681             //   This op scans through all remaining input.
5682             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5683             {
5684                 // Loop through input until the input is exhausted (we reach an end-of-line)
5685                 // In DOTALL mode, we can just go straight to the end of the input.
5686                 int32_t ix;
5687                 if ((opValue & 1) == 1) {
5688                     // Dot-matches-All mode.  Jump straight to the end of the string.
5689                     ix = (int32_t)fActiveLimit;
5690                     fHitEnd = TRUE;
5691                 } else {
5692                     // NOT DOT ALL mode.  Line endings do not match '.'
5693                     // Scan forward until a line ending or end of input.
5694                     ix = (int32_t)fp->fInputIdx;
5695                     for (;;) {
5696                         if (ix >= fActiveLimit) {
5697                             fHitEnd = TRUE;
5698                             break;
5699                         }
5700                         UChar32   c;
5701                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5702                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5703                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5704                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5705                                    isLineTerminator(c))) {
5706                                 //  char is a line ending.  Put the input pos back to the
5707                                 //    line ending char, and exit the scanning loop.
5708                                 U16_BACK_1(inputBuf, 0, ix);
5709                                 break;
5710                             }
5711                         }
5712                     }
5713                 }
5714 
5715                 // If there were no matching characters, skip over the loop altogether.
5716                 //   The loop doesn't run at all, a * op always succeeds.
5717                 if (ix == fp->fInputIdx) {
5718                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5719                     break;
5720                 }
5721 
5722                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5723                 //   must follow.  It's operand is the stack location
5724                 //   that holds the starting input index for the match of this .*
5725                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5726                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5727                 int32_t stackLoc = URX_VAL(loopcOp);
5728                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5729                 fp->fExtra[stackLoc] = fp->fInputIdx;
5730                 fp->fInputIdx = ix;
5731 
5732                 // Save State to the URX_LOOP_C op that follows this one,
5733                 //   so that match failures in the following code will return to there.
5734                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5735                 fp = StateSave(fp, fp->fPatIdx, status);
5736                 fp->fPatIdx++;
5737             }
5738             break;
5739 
5740 
5741         case URX_LOOP_C:
5742             {
5743                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5744                 backSearchIndex = (int32_t)fp->fExtra[opValue];
5745                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5746                 if (backSearchIndex == fp->fInputIdx) {
5747                     // We've backed up the input idx to the point that the loop started.
5748                     // The loop is done.  Leave here without saving state.
5749                     //  Subsequent failures won't come back here.
5750                     break;
5751                 }
5752                 // Set up for the next iteration of the loop, with input index
5753                 //   backed up by one from the last time through,
5754                 //   and a state save to this instruction in case the following code fails again.
5755                 //   (We're going backwards because this loop emulates stack unwinding, not
5756                 //    the initial scan forward.)
5757                 U_ASSERT(fp->fInputIdx > 0);
5758                 UChar32 prevC;
5759                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5760 
5761                 if (prevC == 0x0a &&
5762                     fp->fInputIdx > backSearchIndex &&
5763                     inputBuf[fp->fInputIdx-1] == 0x0d) {
5764                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5765                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5766                         // .*, stepping back over CRLF pair.
5767                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5768                     }
5769                 }
5770 
5771 
5772                 fp = StateSave(fp, fp->fPatIdx-1, status);
5773             }
5774             break;
5775 
5776 
5777 
5778         default:
5779             // Trouble.  The compiled pattern contains an entry with an
5780             //           unrecognized type tag.
5781             U_ASSERT(FALSE);
5782         }
5783 
5784         if (U_FAILURE(status)) {
5785             isMatch = FALSE;
5786             break;
5787         }
5788     }
5789 
5790 breakFromLoop:
5791     fMatch = isMatch;
5792     if (isMatch) {
5793         fLastMatchEnd = fMatchEnd;
5794         fMatchStart   = startIdx;
5795         fMatchEnd     = fp->fInputIdx;
5796     }
5797 
5798 #ifdef REGEX_RUN_DEBUG
5799     if (fTraceDebug) {
5800         if (isMatch) {
5801             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5802         } else {
5803             printf("No match\n\n");
5804         }
5805     }
5806 #endif
5807 
5808     fFrame = fp;                // The active stack frame when the engine stopped.
5809                                 //   Contains the capture group results that we need to
5810                                 //    access later.
5811 
5812     return;
5813 }
5814 
5815 
5816 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5817 
5818 U_NAMESPACE_END
5819 
5820 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5821 
5822