1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 // file: rematch.cpp
11 //
12 // Contains the implementation of class RegexMatcher,
13 // which is one of the main API classes for the ICU regular expression package.
14 //
15
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36
37 // #include <malloc.h> // Needed for heapcheck testing
38
39
40 U_NAMESPACE_BEGIN
41
42 // Default limit for the size of the back track stack, to avoid system
43 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 // which use recursion rather than the heap, and take more storage per
46 // backtrack point.
47 //
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50 // Time limit counter constant.
51 // Time limits for expression evaluation are in terms of quanta of work by
52 // the engine, each of which is 10,000 state saves.
53 // This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)58 static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64
65 //-----------------------------------------------------------------------------
66 //
67 // Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)70 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
76 if (pat==NULL) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
80 fPattern = pat;
81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83
84
85
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)86 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
88 init(status);
89 if (U_FAILURE(status)) {
90 return;
91 }
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
94 fPattern = fPatternOwned;
95
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
100
101 fInputUniStrMaybeMutable = TRUE;
102 }
103
104
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
117 fPattern = fPatternOwned;
118 init2(input, status);
119 }
120
121
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)122 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
123 uint32_t flags, UErrorCode &status) {
124 init(status);
125 if (U_FAILURE(status)) {
126 return;
127 }
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)137 RegexMatcher::RegexMatcher(UText *regexp,
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
149 fPattern = fPatternOwned;
150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152
153
154
155
~RegexMatcher()156 RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
159 uprv_free(fData);
160 fData = NULL;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = NULL;
165 fPattern = NULL;
166 }
167
168 if (fInput) {
169 delete fInput;
170 }
171 if (fInputText) {
172 utext_close(fInputText);
173 }
174 if (fAltInputText) {
175 utext_close(fAltInputText);
176 }
177
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr;
180 delete fGCBreakItr;
181 #endif
182 }
183
184 //
185 // init() common initialization for use by all constructors.
186 // Initialize all fields, get the object into a consistent state.
187 // This must be done even when the initial status shows an error,
188 // so that the object is initialized sufficiently well for the destructor
189 // to run safely.
190 //
init(UErrorCode & status)191 void RegexMatcher::init(UErrorCode &status) {
192 fPattern = NULL;
193 fPatternOwned = NULL;
194 fFrameSize = 0;
195 fRegionStart = 0;
196 fRegionLimit = 0;
197 fAnchorStart = 0;
198 fAnchorLimit = 0;
199 fLookStart = 0;
200 fLookLimit = 0;
201 fActiveStart = 0;
202 fActiveLimit = 0;
203 fTransparentBounds = FALSE;
204 fAnchoringBounds = TRUE;
205 fMatch = FALSE;
206 fMatchStart = 0;
207 fMatchEnd = 0;
208 fLastMatchEnd = -1;
209 fAppendPosition = 0;
210 fHitEnd = FALSE;
211 fRequireEnd = FALSE;
212 fStack = NULL;
213 fFrame = NULL;
214 fTimeLimit = 0;
215 fTime = 0;
216 fTickCounter = 0;
217 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
218 fCallbackFn = NULL;
219 fCallbackContext = NULL;
220 fFindProgressCallbackFn = NULL;
221 fFindProgressCallbackContext = NULL;
222 fTraceDebug = FALSE;
223 fDeferredStatus = status;
224 fData = fSmallData;
225 fWordBreakItr = NULL;
226 fGCBreakItr = NULL;
227
228 fStack = NULL;
229 fInputText = NULL;
230 fAltInputText = NULL;
231 fInput = NULL;
232 fInputLength = 0;
233 fInputUniStrMaybeMutable = FALSE;
234 }
235
236 //
237 // init2() Common initialization for use by RegexMatcher constructors, part 2.
238 // This handles the common setup to be done after the Pattern is available.
239 //
init2(UText * input,UErrorCode & status)240 void RegexMatcher::init2(UText *input, UErrorCode &status) {
241 if (U_FAILURE(status)) {
242 fDeferredStatus = status;
243 return;
244 }
245
246 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
247 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
248 if (fData == NULL) {
249 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
250 return;
251 }
252 }
253
254 fStack = new UVector64(status);
255 if (fStack == NULL) {
256 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
257 return;
258 }
259
260 reset(input);
261 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
262 if (U_FAILURE(status)) {
263 fDeferredStatus = status;
264 return;
265 }
266 }
267
268
269 static const UChar BACKSLASH = 0x5c;
270 static const UChar DOLLARSIGN = 0x24;
271 static const UChar LEFTBRACKET = 0x7b;
272 static const UChar RIGHTBRACKET = 0x7d;
273
274 //--------------------------------------------------------------------------------
275 //
276 // appendReplacement
277 //
278 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280 const UnicodeString &replacement,
281 UErrorCode &status) {
282 UText replacementText = UTEXT_INITIALIZER;
283
284 utext_openConstUnicodeString(&replacementText, &replacement, &status);
285 if (U_SUCCESS(status)) {
286 UText resultText = UTEXT_INITIALIZER;
287 utext_openUnicodeString(&resultText, &dest, &status);
288
289 if (U_SUCCESS(status)) {
290 appendReplacement(&resultText, &replacementText, status);
291 utext_close(&resultText);
292 }
293 utext_close(&replacementText);
294 }
295
296 return *this;
297 }
298
299 //
300 // appendReplacement, UText mode
301 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303 UText *replacement,
304 UErrorCode &status) {
305 if (U_FAILURE(status)) {
306 return *this;
307 }
308 if (U_FAILURE(fDeferredStatus)) {
309 status = fDeferredStatus;
310 return *this;
311 }
312 if (fMatch == FALSE) {
313 status = U_REGEX_INVALID_STATE;
314 return *this;
315 }
316
317 // Copy input string from the end of previous match to start of current match
318 int64_t destLen = utext_nativeLength(dest);
319 if (fMatchStart > fAppendPosition) {
320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322 (int32_t)(fMatchStart-fAppendPosition), &status);
323 } else {
324 int32_t len16;
325 if (UTEXT_USES_U16(fInputText)) {
326 len16 = (int32_t)(fMatchStart-fAppendPosition);
327 } else {
328 UErrorCode lengthStatus = U_ZERO_ERROR;
329 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330 }
331 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332 if (inputChars == NULL) {
333 status = U_MEMORY_ALLOCATION_ERROR;
334 return *this;
335 }
336 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338 uprv_free(inputChars);
339 }
340 }
341 fAppendPosition = fMatchEnd;
342
343
344 // scan the replacement text, looking for substitutions ($n) and \escapes.
345 // TODO: optimize this loop by efficiently scanning for '$' or '\',
346 // move entire ranges not containing substitutions.
347 UTEXT_SETNATIVEINDEX(replacement, 0);
348 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
349 if (c == BACKSLASH) {
350 // Backslash Escape. Copy the following char out without further checks.
351 // Note: Surrogate pairs don't need any special handling
352 // The second half wont be a '$' or a '\', and
353 // will move to the dest normally on the next
354 // loop iteration.
355 c = UTEXT_CURRENT32(replacement);
356 if (c == U_SENTINEL) {
357 break;
358 }
359
360 if (c==0x55/*U*/ || c==0x75/*u*/) {
361 // We have a \udddd or \Udddddddd escape sequence.
362 int32_t offset = 0;
363 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
364 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
365 if (escapedChar != (UChar32)0xFFFFFFFF) {
366 if (U_IS_BMP(escapedChar)) {
367 UChar c16 = (UChar)escapedChar;
368 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
369 } else {
370 UChar surrogate[2];
371 surrogate[0] = U16_LEAD(escapedChar);
372 surrogate[1] = U16_TRAIL(escapedChar);
373 if (U_SUCCESS(status)) {
374 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
375 }
376 }
377 // TODO: Report errors for mal-formed \u escapes?
378 // As this is, the original sequence is output, which may be OK.
379 if (context.lastOffset == offset) {
380 (void)UTEXT_PREVIOUS32(replacement);
381 } else if (context.lastOffset != offset-1) {
382 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
383 }
384 }
385 } else {
386 (void)UTEXT_NEXT32(replacement);
387 // Plain backslash escape. Just put out the escaped character.
388 if (U_IS_BMP(c)) {
389 UChar c16 = (UChar)c;
390 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
391 } else {
392 UChar surrogate[2];
393 surrogate[0] = U16_LEAD(c);
394 surrogate[1] = U16_TRAIL(c);
395 if (U_SUCCESS(status)) {
396 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
397 }
398 }
399 }
400 } else if (c != DOLLARSIGN) {
401 // Normal char, not a $. Copy it out without further checks.
402 if (U_IS_BMP(c)) {
403 UChar c16 = (UChar)c;
404 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
405 } else {
406 UChar surrogate[2];
407 surrogate[0] = U16_LEAD(c);
408 surrogate[1] = U16_TRAIL(c);
409 if (U_SUCCESS(status)) {
410 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
411 }
412 }
413 } else {
414 // We've got a $. Pick up a capture group name or number if one follows.
415 // Consume digits so long as the resulting group number <= the number of
416 // number of capture groups in the pattern.
417
418 int32_t groupNum = 0;
419 int32_t numDigits = 0;
420 UChar32 nextChar = utext_current32(replacement);
421 if (nextChar == LEFTBRACKET) {
422 // Scan for a Named Capture Group, ${name}.
423 UnicodeString groupName;
424 utext_next32(replacement);
425 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
426 nextChar = utext_next32(replacement);
427 if (nextChar == U_SENTINEL) {
428 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
429 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
430 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
431 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
432 groupName.append(nextChar);
433 } else if (nextChar == RIGHTBRACKET) {
434 groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
435 if (groupNum == 0) {
436 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
437 }
438 } else {
439 // Character was something other than a name char or a closing '}'
440 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
441 }
442 }
443
444 } else if (u_isdigit(nextChar)) {
445 // $n Scan for a capture group number
446 int32_t numCaptureGroups = fPattern->fGroupMap->size();
447 for (;;) {
448 nextChar = UTEXT_CURRENT32(replacement);
449 if (nextChar == U_SENTINEL) {
450 break;
451 }
452 if (u_isdigit(nextChar) == FALSE) {
453 break;
454 }
455 int32_t nextDigitVal = u_charDigitValue(nextChar);
456 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
457 // Don't consume the next digit if it makes the capture group number too big.
458 if (numDigits == 0) {
459 status = U_INDEX_OUTOFBOUNDS_ERROR;
460 }
461 break;
462 }
463 (void)UTEXT_NEXT32(replacement);
464 groupNum=groupNum*10 + nextDigitVal;
465 ++numDigits;
466 }
467 } else {
468 // $ not followed by capture group name or number.
469 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
470 }
471
472 if (U_SUCCESS(status)) {
473 destLen += appendGroup(groupNum, dest, status);
474 }
475 } // End of $ capture group handling
476 } // End of per-character loop through the replacement string.
477
478 return *this;
479 }
480
481
482
483 //--------------------------------------------------------------------------------
484 //
485 // appendTail Intended to be used in conjunction with appendReplacement()
486 // To the destination string, append everything following
487 // the last match position from the input string.
488 //
489 // Note: Match ranges do not affect appendTail or appendReplacement
490 //
491 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)492 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
493 UErrorCode status = U_ZERO_ERROR;
494 UText resultText = UTEXT_INITIALIZER;
495 utext_openUnicodeString(&resultText, &dest, &status);
496
497 if (U_SUCCESS(status)) {
498 appendTail(&resultText, status);
499 utext_close(&resultText);
500 }
501
502 return dest;
503 }
504
505 //
506 // appendTail, UText mode
507 //
appendTail(UText * dest,UErrorCode & status)508 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
509 if (U_FAILURE(status)) {
510 return dest;
511 }
512 if (U_FAILURE(fDeferredStatus)) {
513 status = fDeferredStatus;
514 return dest;
515 }
516
517 if (fInputLength > fAppendPosition) {
518 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
519 int64_t destLen = utext_nativeLength(dest);
520 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
521 (int32_t)(fInputLength-fAppendPosition), &status);
522 } else {
523 int32_t len16;
524 if (UTEXT_USES_U16(fInputText)) {
525 len16 = (int32_t)(fInputLength-fAppendPosition);
526 } else {
527 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
528 status = U_ZERO_ERROR; // buffer overflow
529 }
530
531 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
532 if (inputChars == NULL) {
533 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
534 } else {
535 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
536 int64_t destLen = utext_nativeLength(dest);
537 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
538 uprv_free(inputChars);
539 }
540 }
541 }
542 return dest;
543 }
544
545
546
547 //--------------------------------------------------------------------------------
548 //
549 // end
550 //
551 //--------------------------------------------------------------------------------
end(UErrorCode & err) const552 int32_t RegexMatcher::end(UErrorCode &err) const {
553 return end(0, err);
554 }
555
end64(UErrorCode & err) const556 int64_t RegexMatcher::end64(UErrorCode &err) const {
557 return end64(0, err);
558 }
559
end64(int32_t group,UErrorCode & err) const560 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
561 if (U_FAILURE(err)) {
562 return -1;
563 }
564 if (fMatch == FALSE) {
565 err = U_REGEX_INVALID_STATE;
566 return -1;
567 }
568 if (group < 0 || group > fPattern->fGroupMap->size()) {
569 err = U_INDEX_OUTOFBOUNDS_ERROR;
570 return -1;
571 }
572 int64_t e = -1;
573 if (group == 0) {
574 e = fMatchEnd;
575 } else {
576 // Get the position within the stack frame of the variables for
577 // this capture group.
578 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
579 U_ASSERT(groupOffset < fPattern->fFrameSize);
580 U_ASSERT(groupOffset >= 0);
581 e = fFrame->fExtra[groupOffset + 1];
582 }
583
584 return e;
585 }
586
end(int32_t group,UErrorCode & err) const587 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
588 return (int32_t)end64(group, err);
589 }
590
591 //--------------------------------------------------------------------------------
592 //
593 // findProgressInterrupt This function is called once for each advance in the target
594 // string from the find() function, and calls the user progress callback
595 // function if there is one installed.
596 //
597 // Return: TRUE if the find operation is to be terminated.
598 // FALSE if the find operation is to continue running.
599 //
600 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)601 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
602 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
603 status = U_REGEX_STOPPED_BY_CALLER;
604 return TRUE;
605 }
606 return FALSE;
607 }
608
609 //--------------------------------------------------------------------------------
610 //
611 // find()
612 //
613 //--------------------------------------------------------------------------------
find()614 UBool RegexMatcher::find() {
615 if (U_FAILURE(fDeferredStatus)) {
616 return FALSE;
617 }
618 UErrorCode status = U_ZERO_ERROR;
619 UBool result = find(status);
620 return result;
621 }
622
623 //--------------------------------------------------------------------------------
624 //
625 // find()
626 //
627 //--------------------------------------------------------------------------------
find(UErrorCode & status)628 UBool RegexMatcher::find(UErrorCode &status) {
629 // Start at the position of the last match end. (Will be zero if the
630 // matcher has been reset.)
631 //
632 if (U_FAILURE(status)) {
633 return FALSE;
634 }
635 if (U_FAILURE(fDeferredStatus)) {
636 status = fDeferredStatus;
637 return FALSE;
638 }
639
640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
641 return findUsingChunk(status);
642 }
643
644 int64_t startPos = fMatchEnd;
645 if (startPos==0) {
646 startPos = fActiveStart;
647 }
648
649 if (fMatch) {
650 // Save the position of any previous successful match.
651 fLastMatchEnd = fMatchEnd;
652
653 if (fMatchStart == fMatchEnd) {
654 // Previous match had zero length. Move start position up one position
655 // to avoid sending find() into a loop on zero-length matches.
656 if (startPos >= fActiveLimit) {
657 fMatch = FALSE;
658 fHitEnd = TRUE;
659 return FALSE;
660 }
661 UTEXT_SETNATIVEINDEX(fInputText, startPos);
662 (void)UTEXT_NEXT32(fInputText);
663 startPos = UTEXT_GETNATIVEINDEX(fInputText);
664 }
665 } else {
666 if (fLastMatchEnd >= 0) {
667 // A previous find() failed to match. Don't try again.
668 // (without this test, a pattern with a zero-length match
669 // could match again at the end of an input string.)
670 fHitEnd = TRUE;
671 return FALSE;
672 }
673 }
674
675
676 // Compute the position in the input string beyond which a match can not begin, because
677 // the minimum length match would extend past the end of the input.
678 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
679 // Be aware of possible overflows if making changes here.
680 int64_t testStartLimit;
681 if (UTEXT_USES_U16(fInputText)) {
682 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
683 if (startPos > testStartLimit) {
684 fMatch = FALSE;
685 fHitEnd = TRUE;
686 return FALSE;
687 }
688 } else {
689 // We don't know exactly how long the minimum match length is in native characters.
690 // Treat anything > 0 as 1.
691 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
692 }
693
694 UChar32 c;
695 U_ASSERT(startPos >= 0);
696
697 switch (fPattern->fStartType) {
698 case START_NO_INFO:
699 // No optimization was found.
700 // Try a match at each input position.
701 for (;;) {
702 MatchAt(startPos, FALSE, status);
703 if (U_FAILURE(status)) {
704 return FALSE;
705 }
706 if (fMatch) {
707 return TRUE;
708 }
709 if (startPos >= testStartLimit) {
710 fHitEnd = TRUE;
711 return FALSE;
712 }
713 UTEXT_SETNATIVEINDEX(fInputText, startPos);
714 (void)UTEXT_NEXT32(fInputText);
715 startPos = UTEXT_GETNATIVEINDEX(fInputText);
716 // Note that it's perfectly OK for a pattern to have a zero-length
717 // match at the end of a string, so we must make sure that the loop
718 // runs with startPos == testStartLimit the last time through.
719 if (findProgressInterrupt(startPos, status))
720 return FALSE;
721 }
722 UPRV_UNREACHABLE_EXIT;
723
724 case START_START:
725 // Matches are only possible at the start of the input string
726 // (pattern begins with ^ or \A)
727 if (startPos > fActiveStart) {
728 fMatch = FALSE;
729 return FALSE;
730 }
731 MatchAt(startPos, FALSE, status);
732 if (U_FAILURE(status)) {
733 return FALSE;
734 }
735 return fMatch;
736
737
738 case START_SET:
739 {
740 // Match may start on any char from a pre-computed set.
741 U_ASSERT(fPattern->fMinMatchLen > 0);
742 UTEXT_SETNATIVEINDEX(fInputText, startPos);
743 for (;;) {
744 int64_t pos = startPos;
745 c = UTEXT_NEXT32(fInputText);
746 startPos = UTEXT_GETNATIVEINDEX(fInputText);
747 // c will be -1 (U_SENTINEL) at end of text, in which case we
748 // skip this next block (so we don't have a negative array index)
749 // and handle end of text in the following block.
750 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
751 (c>=256 && fPattern->fInitialChars->contains(c)))) {
752 MatchAt(pos, FALSE, status);
753 if (U_FAILURE(status)) {
754 return FALSE;
755 }
756 if (fMatch) {
757 return TRUE;
758 }
759 UTEXT_SETNATIVEINDEX(fInputText, pos);
760 }
761 if (startPos > testStartLimit) {
762 fMatch = FALSE;
763 fHitEnd = TRUE;
764 return FALSE;
765 }
766 if (findProgressInterrupt(startPos, status))
767 return FALSE;
768 }
769 }
770 UPRV_UNREACHABLE_EXIT;
771
772 case START_STRING:
773 case START_CHAR:
774 {
775 // Match starts on exactly one char.
776 U_ASSERT(fPattern->fMinMatchLen > 0);
777 UChar32 theChar = fPattern->fInitialChar;
778 UTEXT_SETNATIVEINDEX(fInputText, startPos);
779 for (;;) {
780 int64_t pos = startPos;
781 c = UTEXT_NEXT32(fInputText);
782 startPos = UTEXT_GETNATIVEINDEX(fInputText);
783 if (c == theChar) {
784 MatchAt(pos, FALSE, status);
785 if (U_FAILURE(status)) {
786 return FALSE;
787 }
788 if (fMatch) {
789 return TRUE;
790 }
791 UTEXT_SETNATIVEINDEX(fInputText, startPos);
792 }
793 if (startPos > testStartLimit) {
794 fMatch = FALSE;
795 fHitEnd = TRUE;
796 return FALSE;
797 }
798 if (findProgressInterrupt(startPos, status))
799 return FALSE;
800 }
801 }
802 UPRV_UNREACHABLE_EXIT;
803
804 case START_LINE:
805 {
806 UChar32 ch;
807 if (startPos == fAnchorStart) {
808 MatchAt(startPos, FALSE, status);
809 if (U_FAILURE(status)) {
810 return FALSE;
811 }
812 if (fMatch) {
813 return TRUE;
814 }
815 UTEXT_SETNATIVEINDEX(fInputText, startPos);
816 ch = UTEXT_NEXT32(fInputText);
817 startPos = UTEXT_GETNATIVEINDEX(fInputText);
818 } else {
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820 ch = UTEXT_PREVIOUS32(fInputText);
821 UTEXT_SETNATIVEINDEX(fInputText, startPos);
822 }
823
824 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
825 for (;;) {
826 if (ch == 0x0a) {
827 MatchAt(startPos, FALSE, status);
828 if (U_FAILURE(status)) {
829 return FALSE;
830 }
831 if (fMatch) {
832 return TRUE;
833 }
834 UTEXT_SETNATIVEINDEX(fInputText, startPos);
835 }
836 if (startPos >= testStartLimit) {
837 fMatch = FALSE;
838 fHitEnd = TRUE;
839 return FALSE;
840 }
841 ch = UTEXT_NEXT32(fInputText);
842 startPos = UTEXT_GETNATIVEINDEX(fInputText);
843 // Note that it's perfectly OK for a pattern to have a zero-length
844 // match at the end of a string, so we must make sure that the loop
845 // runs with startPos == testStartLimit the last time through.
846 if (findProgressInterrupt(startPos, status))
847 return FALSE;
848 }
849 } else {
850 for (;;) {
851 if (isLineTerminator(ch)) {
852 if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
853 (void)UTEXT_NEXT32(fInputText);
854 startPos = UTEXT_GETNATIVEINDEX(fInputText);
855 }
856 MatchAt(startPos, FALSE, status);
857 if (U_FAILURE(status)) {
858 return FALSE;
859 }
860 if (fMatch) {
861 return TRUE;
862 }
863 UTEXT_SETNATIVEINDEX(fInputText, startPos);
864 }
865 if (startPos >= testStartLimit) {
866 fMatch = FALSE;
867 fHitEnd = TRUE;
868 return FALSE;
869 }
870 ch = UTEXT_NEXT32(fInputText);
871 startPos = UTEXT_GETNATIVEINDEX(fInputText);
872 // Note that it's perfectly OK for a pattern to have a zero-length
873 // match at the end of a string, so we must make sure that the loop
874 // runs with startPos == testStartLimit the last time through.
875 if (findProgressInterrupt(startPos, status))
876 return FALSE;
877 }
878 }
879 }
880
881 default:
882 UPRV_UNREACHABLE_ASSERT;
883 // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
884 // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
885 // See ICU-21669.
886 status = U_INTERNAL_PROGRAM_ERROR;
887 return FALSE;
888 }
889
890 UPRV_UNREACHABLE_EXIT;
891 }
892
893
894
find(int64_t start,UErrorCode & status)895 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
896 if (U_FAILURE(status)) {
897 return FALSE;
898 }
899 if (U_FAILURE(fDeferredStatus)) {
900 status = fDeferredStatus;
901 return FALSE;
902 }
903 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
904 // This will reset the region to be the full input length.
905 if (start < 0) {
906 status = U_INDEX_OUTOFBOUNDS_ERROR;
907 return FALSE;
908 }
909
910 int64_t nativeStart = start;
911 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
912 status = U_INDEX_OUTOFBOUNDS_ERROR;
913 return FALSE;
914 }
915 fMatchEnd = nativeStart;
916 return find(status);
917 }
918
919
920 //--------------------------------------------------------------------------------
921 //
922 // findUsingChunk() -- like find(), but with the advance knowledge that the
923 // entire string is available in the UText's chunk buffer.
924 //
925 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)926 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
927 // Start at the position of the last match end. (Will be zero if the
928 // matcher has been reset.
929 //
930
931 int32_t startPos = (int32_t)fMatchEnd;
932 if (startPos==0) {
933 startPos = (int32_t)fActiveStart;
934 }
935
936 const UChar *inputBuf = fInputText->chunkContents;
937
938 if (fMatch) {
939 // Save the position of any previous successful match.
940 fLastMatchEnd = fMatchEnd;
941
942 if (fMatchStart == fMatchEnd) {
943 // Previous match had zero length. Move start position up one position
944 // to avoid sending find() into a loop on zero-length matches.
945 if (startPos >= fActiveLimit) {
946 fMatch = FALSE;
947 fHitEnd = TRUE;
948 return FALSE;
949 }
950 U16_FWD_1(inputBuf, startPos, fInputLength);
951 }
952 } else {
953 if (fLastMatchEnd >= 0) {
954 // A previous find() failed to match. Don't try again.
955 // (without this test, a pattern with a zero-length match
956 // could match again at the end of an input string.)
957 fHitEnd = TRUE;
958 return FALSE;
959 }
960 }
961
962
963 // Compute the position in the input string beyond which a match can not begin, because
964 // the minimum length match would extend past the end of the input.
965 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
966 // Be aware of possible overflows if making changes here.
967 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
968 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
969 if (startPos > testLen) {
970 fMatch = FALSE;
971 fHitEnd = TRUE;
972 return FALSE;
973 }
974
975 UChar32 c;
976 U_ASSERT(startPos >= 0);
977
978 switch (fPattern->fStartType) {
979 case START_NO_INFO:
980 // No optimization was found.
981 // Try a match at each input position.
982 for (;;) {
983 MatchChunkAt(startPos, FALSE, status);
984 if (U_FAILURE(status)) {
985 return FALSE;
986 }
987 if (fMatch) {
988 return TRUE;
989 }
990 if (startPos >= testLen) {
991 fHitEnd = TRUE;
992 return FALSE;
993 }
994 U16_FWD_1(inputBuf, startPos, fActiveLimit);
995 // Note that it's perfectly OK for a pattern to have a zero-length
996 // match at the end of a string, so we must make sure that the loop
997 // runs with startPos == testLen the last time through.
998 if (findProgressInterrupt(startPos, status))
999 return FALSE;
1000 }
1001 UPRV_UNREACHABLE_EXIT;
1002
1003 case START_START:
1004 // Matches are only possible at the start of the input string
1005 // (pattern begins with ^ or \A)
1006 if (startPos > fActiveStart) {
1007 fMatch = FALSE;
1008 return FALSE;
1009 }
1010 MatchChunkAt(startPos, FALSE, status);
1011 if (U_FAILURE(status)) {
1012 return FALSE;
1013 }
1014 return fMatch;
1015
1016
1017 case START_SET:
1018 {
1019 // Match may start on any char from a pre-computed set.
1020 U_ASSERT(fPattern->fMinMatchLen > 0);
1021 for (;;) {
1022 int32_t pos = startPos;
1023 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1024 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1025 (c>=256 && fPattern->fInitialChars->contains(c))) {
1026 MatchChunkAt(pos, FALSE, status);
1027 if (U_FAILURE(status)) {
1028 return FALSE;
1029 }
1030 if (fMatch) {
1031 return TRUE;
1032 }
1033 }
1034 if (startPos > testLen) {
1035 fMatch = FALSE;
1036 fHitEnd = TRUE;
1037 return FALSE;
1038 }
1039 if (findProgressInterrupt(startPos, status))
1040 return FALSE;
1041 }
1042 }
1043 UPRV_UNREACHABLE_EXIT;
1044
1045 case START_STRING:
1046 case START_CHAR:
1047 {
1048 // Match starts on exactly one char.
1049 U_ASSERT(fPattern->fMinMatchLen > 0);
1050 UChar32 theChar = fPattern->fInitialChar;
1051 for (;;) {
1052 int32_t pos = startPos;
1053 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1054 if (c == theChar) {
1055 MatchChunkAt(pos, FALSE, status);
1056 if (U_FAILURE(status)) {
1057 return FALSE;
1058 }
1059 if (fMatch) {
1060 return TRUE;
1061 }
1062 }
1063 if (startPos > testLen) {
1064 fMatch = FALSE;
1065 fHitEnd = TRUE;
1066 return FALSE;
1067 }
1068 if (findProgressInterrupt(startPos, status))
1069 return FALSE;
1070 }
1071 }
1072 UPRV_UNREACHABLE_EXIT;
1073
1074 case START_LINE:
1075 {
1076 UChar32 ch;
1077 if (startPos == fAnchorStart) {
1078 MatchChunkAt(startPos, FALSE, status);
1079 if (U_FAILURE(status)) {
1080 return FALSE;
1081 }
1082 if (fMatch) {
1083 return TRUE;
1084 }
1085 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1086 }
1087
1088 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1089 for (;;) {
1090 ch = inputBuf[startPos-1];
1091 if (ch == 0x0a) {
1092 MatchChunkAt(startPos, FALSE, status);
1093 if (U_FAILURE(status)) {
1094 return FALSE;
1095 }
1096 if (fMatch) {
1097 return TRUE;
1098 }
1099 }
1100 if (startPos >= testLen) {
1101 fMatch = FALSE;
1102 fHitEnd = TRUE;
1103 return FALSE;
1104 }
1105 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1106 // Note that it's perfectly OK for a pattern to have a zero-length
1107 // match at the end of a string, so we must make sure that the loop
1108 // runs with startPos == testLen the last time through.
1109 if (findProgressInterrupt(startPos, status))
1110 return FALSE;
1111 }
1112 } else {
1113 for (;;) {
1114 ch = inputBuf[startPos-1];
1115 if (isLineTerminator(ch)) {
1116 if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1117 startPos++;
1118 }
1119 MatchChunkAt(startPos, FALSE, status);
1120 if (U_FAILURE(status)) {
1121 return FALSE;
1122 }
1123 if (fMatch) {
1124 return TRUE;
1125 }
1126 }
1127 if (startPos >= testLen) {
1128 fMatch = FALSE;
1129 fHitEnd = TRUE;
1130 return FALSE;
1131 }
1132 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1133 // Note that it's perfectly OK for a pattern to have a zero-length
1134 // match at the end of a string, so we must make sure that the loop
1135 // runs with startPos == testLen the last time through.
1136 if (findProgressInterrupt(startPos, status))
1137 return FALSE;
1138 }
1139 }
1140 }
1141
1142 default:
1143 UPRV_UNREACHABLE_ASSERT;
1144 // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
1145 // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
1146 // See ICU-21669.
1147 status = U_INTERNAL_PROGRAM_ERROR;
1148 return FALSE;
1149 }
1150
1151 UPRV_UNREACHABLE_EXIT;
1152 }
1153
1154
1155
1156 //--------------------------------------------------------------------------------
1157 //
1158 // group()
1159 //
1160 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1161 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1162 return group(0, status);
1163 }
1164
1165 // Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1166 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1167 return group(0, dest, group_len, status);
1168 }
1169
1170 // Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1171 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1172 group_len = 0;
1173 if (U_FAILURE(status)) {
1174 return dest;
1175 }
1176 if (U_FAILURE(fDeferredStatus)) {
1177 status = fDeferredStatus;
1178 } else if (fMatch == FALSE) {
1179 status = U_REGEX_INVALID_STATE;
1180 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1181 status = U_INDEX_OUTOFBOUNDS_ERROR;
1182 }
1183
1184 if (U_FAILURE(status)) {
1185 return dest;
1186 }
1187
1188 int64_t s, e;
1189 if (groupNum == 0) {
1190 s = fMatchStart;
1191 e = fMatchEnd;
1192 } else {
1193 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1194 U_ASSERT(groupOffset < fPattern->fFrameSize);
1195 U_ASSERT(groupOffset >= 0);
1196 s = fFrame->fExtra[groupOffset];
1197 e = fFrame->fExtra[groupOffset+1];
1198 }
1199
1200 if (s < 0) {
1201 // A capture group wasn't part of the match
1202 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1203 }
1204 U_ASSERT(s <= e);
1205 group_len = e - s;
1206
1207 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1208 if (dest)
1209 UTEXT_SETNATIVEINDEX(dest, s);
1210 return dest;
1211 }
1212
group(int32_t groupNum,UErrorCode & status) const1213 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1214 UnicodeString result;
1215 int64_t groupStart = start64(groupNum, status);
1216 int64_t groupEnd = end64(groupNum, status);
1217 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1218 return result;
1219 }
1220
1221 // Get the group length using a utext_extract preflight.
1222 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1223 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1224 if (status != U_BUFFER_OVERFLOW_ERROR) {
1225 return result;
1226 }
1227
1228 status = U_ZERO_ERROR;
1229 UChar *buf = result.getBuffer(length);
1230 if (buf == NULL) {
1231 status = U_MEMORY_ALLOCATION_ERROR;
1232 } else {
1233 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1234 result.releaseBuffer(extractLength);
1235 U_ASSERT(length == extractLength);
1236 }
1237 return result;
1238 }
1239
1240
1241 //--------------------------------------------------------------------------------
1242 //
1243 // appendGroup() -- currently internal only, appends a group to a UText rather
1244 // than replacing its contents
1245 //
1246 //--------------------------------------------------------------------------------
1247
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1248 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1249 if (U_FAILURE(status)) {
1250 return 0;
1251 }
1252 if (U_FAILURE(fDeferredStatus)) {
1253 status = fDeferredStatus;
1254 return 0;
1255 }
1256 int64_t destLen = utext_nativeLength(dest);
1257
1258 if (fMatch == FALSE) {
1259 status = U_REGEX_INVALID_STATE;
1260 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1261 }
1262 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1263 status = U_INDEX_OUTOFBOUNDS_ERROR;
1264 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1265 }
1266
1267 int64_t s, e;
1268 if (groupNum == 0) {
1269 s = fMatchStart;
1270 e = fMatchEnd;
1271 } else {
1272 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1273 U_ASSERT(groupOffset < fPattern->fFrameSize);
1274 U_ASSERT(groupOffset >= 0);
1275 s = fFrame->fExtra[groupOffset];
1276 e = fFrame->fExtra[groupOffset+1];
1277 }
1278
1279 if (s < 0) {
1280 // A capture group wasn't part of the match
1281 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1282 }
1283 U_ASSERT(s <= e);
1284
1285 int64_t deltaLen;
1286 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1287 U_ASSERT(e <= fInputLength);
1288 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1289 } else {
1290 int32_t len16;
1291 if (UTEXT_USES_U16(fInputText)) {
1292 len16 = (int32_t)(e-s);
1293 } else {
1294 UErrorCode lengthStatus = U_ZERO_ERROR;
1295 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1296 }
1297 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1298 if (groupChars == NULL) {
1299 status = U_MEMORY_ALLOCATION_ERROR;
1300 return 0;
1301 }
1302 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1303
1304 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1305 uprv_free(groupChars);
1306 }
1307 return deltaLen;
1308 }
1309
1310
1311
1312 //--------------------------------------------------------------------------------
1313 //
1314 // groupCount()
1315 //
1316 //--------------------------------------------------------------------------------
groupCount() const1317 int32_t RegexMatcher::groupCount() const {
1318 return fPattern->fGroupMap->size();
1319 }
1320
1321 //--------------------------------------------------------------------------------
1322 //
1323 // hasAnchoringBounds()
1324 //
1325 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1326 UBool RegexMatcher::hasAnchoringBounds() const {
1327 return fAnchoringBounds;
1328 }
1329
1330
1331 //--------------------------------------------------------------------------------
1332 //
1333 // hasTransparentBounds()
1334 //
1335 //--------------------------------------------------------------------------------
hasTransparentBounds() const1336 UBool RegexMatcher::hasTransparentBounds() const {
1337 return fTransparentBounds;
1338 }
1339
1340
1341
1342 //--------------------------------------------------------------------------------
1343 //
1344 // hitEnd()
1345 //
1346 //--------------------------------------------------------------------------------
hitEnd() const1347 UBool RegexMatcher::hitEnd() const {
1348 return fHitEnd;
1349 }
1350
1351
1352 //--------------------------------------------------------------------------------
1353 //
1354 // input()
1355 //
1356 //--------------------------------------------------------------------------------
input() const1357 const UnicodeString &RegexMatcher::input() const {
1358 if (!fInput) {
1359 UErrorCode status = U_ZERO_ERROR;
1360 int32_t len16;
1361 if (UTEXT_USES_U16(fInputText)) {
1362 len16 = (int32_t)fInputLength;
1363 } else {
1364 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1365 status = U_ZERO_ERROR; // overflow, length status
1366 }
1367 UnicodeString *result = new UnicodeString(len16, 0, 0);
1368
1369 UChar *inputChars = result->getBuffer(len16);
1370 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1371 result->releaseBuffer(len16);
1372
1373 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1374 }
1375
1376 return *fInput;
1377 }
1378
1379 //--------------------------------------------------------------------------------
1380 //
1381 // inputText()
1382 //
1383 //--------------------------------------------------------------------------------
inputText() const1384 UText *RegexMatcher::inputText() const {
1385 return fInputText;
1386 }
1387
1388
1389 //--------------------------------------------------------------------------------
1390 //
1391 // getInput() -- like inputText(), but makes a clone or copies into another UText
1392 //
1393 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1394 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1395 if (U_FAILURE(status)) {
1396 return dest;
1397 }
1398 if (U_FAILURE(fDeferredStatus)) {
1399 status = fDeferredStatus;
1400 return dest;
1401 }
1402
1403 if (dest) {
1404 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1405 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1406 } else {
1407 int32_t input16Len;
1408 if (UTEXT_USES_U16(fInputText)) {
1409 input16Len = (int32_t)fInputLength;
1410 } else {
1411 UErrorCode lengthStatus = U_ZERO_ERROR;
1412 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1413 }
1414 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1415 if (inputChars == NULL) {
1416 return dest;
1417 }
1418
1419 status = U_ZERO_ERROR;
1420 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1421 status = U_ZERO_ERROR;
1422 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1423
1424 uprv_free(inputChars);
1425 }
1426 return dest;
1427 } else {
1428 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1429 }
1430 }
1431
1432
1433 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1434 static UBool compat_SyncMutableUTextContents(UText *ut) {
1435 UBool retVal = FALSE;
1436
1437 // In the following test, we're really only interested in whether the UText should switch
1438 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1439 // will still point to the correct data.
1440 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1441 UnicodeString *us=(UnicodeString *)ut->context;
1442
1443 // Update to the latest length.
1444 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1445 int32_t newLength = us->length();
1446
1447 // Update the chunk description.
1448 // The buffer may have switched between stack- and heap-based.
1449 ut->chunkContents = us->getBuffer();
1450 ut->chunkLength = newLength;
1451 ut->chunkNativeLimit = newLength;
1452 ut->nativeIndexingLimit = newLength;
1453 retVal = TRUE;
1454 }
1455
1456 return retVal;
1457 }
1458
1459 //--------------------------------------------------------------------------------
1460 //
1461 // lookingAt()
1462 //
1463 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1464 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1465 if (U_FAILURE(status)) {
1466 return FALSE;
1467 }
1468 if (U_FAILURE(fDeferredStatus)) {
1469 status = fDeferredStatus;
1470 return FALSE;
1471 }
1472
1473 if (fInputUniStrMaybeMutable) {
1474 if (compat_SyncMutableUTextContents(fInputText)) {
1475 fInputLength = utext_nativeLength(fInputText);
1476 reset();
1477 }
1478 }
1479 else {
1480 resetPreserveRegion();
1481 }
1482 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1483 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1484 } else {
1485 MatchAt(fActiveStart, FALSE, status);
1486 }
1487 return fMatch;
1488 }
1489
1490
lookingAt(int64_t start,UErrorCode & status)1491 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1492 if (U_FAILURE(status)) {
1493 return FALSE;
1494 }
1495 if (U_FAILURE(fDeferredStatus)) {
1496 status = fDeferredStatus;
1497 return FALSE;
1498 }
1499 reset();
1500
1501 if (start < 0) {
1502 status = U_INDEX_OUTOFBOUNDS_ERROR;
1503 return FALSE;
1504 }
1505
1506 if (fInputUniStrMaybeMutable) {
1507 if (compat_SyncMutableUTextContents(fInputText)) {
1508 fInputLength = utext_nativeLength(fInputText);
1509 reset();
1510 }
1511 }
1512
1513 int64_t nativeStart;
1514 nativeStart = start;
1515 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1516 status = U_INDEX_OUTOFBOUNDS_ERROR;
1517 return FALSE;
1518 }
1519
1520 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1521 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1522 } else {
1523 MatchAt(nativeStart, FALSE, status);
1524 }
1525 return fMatch;
1526 }
1527
1528
1529
1530 //--------------------------------------------------------------------------------
1531 //
1532 // matches()
1533 //
1534 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1535 UBool RegexMatcher::matches(UErrorCode &status) {
1536 if (U_FAILURE(status)) {
1537 return FALSE;
1538 }
1539 if (U_FAILURE(fDeferredStatus)) {
1540 status = fDeferredStatus;
1541 return FALSE;
1542 }
1543
1544 if (fInputUniStrMaybeMutable) {
1545 if (compat_SyncMutableUTextContents(fInputText)) {
1546 fInputLength = utext_nativeLength(fInputText);
1547 reset();
1548 }
1549 }
1550 else {
1551 resetPreserveRegion();
1552 }
1553
1554 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1555 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1556 } else {
1557 MatchAt(fActiveStart, TRUE, status);
1558 }
1559 return fMatch;
1560 }
1561
1562
matches(int64_t start,UErrorCode & status)1563 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1564 if (U_FAILURE(status)) {
1565 return FALSE;
1566 }
1567 if (U_FAILURE(fDeferredStatus)) {
1568 status = fDeferredStatus;
1569 return FALSE;
1570 }
1571 reset();
1572
1573 if (start < 0) {
1574 status = U_INDEX_OUTOFBOUNDS_ERROR;
1575 return FALSE;
1576 }
1577
1578 if (fInputUniStrMaybeMutable) {
1579 if (compat_SyncMutableUTextContents(fInputText)) {
1580 fInputLength = utext_nativeLength(fInputText);
1581 reset();
1582 }
1583 }
1584
1585 int64_t nativeStart;
1586 nativeStart = start;
1587 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1588 status = U_INDEX_OUTOFBOUNDS_ERROR;
1589 return FALSE;
1590 }
1591
1592 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1593 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1594 } else {
1595 MatchAt(nativeStart, TRUE, status);
1596 }
1597 return fMatch;
1598 }
1599
1600
1601
1602 //--------------------------------------------------------------------------------
1603 //
1604 // pattern
1605 //
1606 //--------------------------------------------------------------------------------
pattern() const1607 const RegexPattern &RegexMatcher::pattern() const {
1608 return *fPattern;
1609 }
1610
1611
1612
1613 //--------------------------------------------------------------------------------
1614 //
1615 // region
1616 //
1617 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1618 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1619 if (U_FAILURE(status)) {
1620 return *this;
1621 }
1622
1623 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1624 status = U_ILLEGAL_ARGUMENT_ERROR;
1625 }
1626
1627 int64_t nativeStart = regionStart;
1628 int64_t nativeLimit = regionLimit;
1629 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1630 status = U_ILLEGAL_ARGUMENT_ERROR;
1631 }
1632
1633 if (startIndex == -1)
1634 this->reset();
1635 else
1636 resetPreserveRegion();
1637
1638 fRegionStart = nativeStart;
1639 fRegionLimit = nativeLimit;
1640 fActiveStart = nativeStart;
1641 fActiveLimit = nativeLimit;
1642
1643 if (startIndex != -1) {
1644 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1645 status = U_INDEX_OUTOFBOUNDS_ERROR;
1646 }
1647 fMatchEnd = startIndex;
1648 }
1649
1650 if (!fTransparentBounds) {
1651 fLookStart = nativeStart;
1652 fLookLimit = nativeLimit;
1653 }
1654 if (fAnchoringBounds) {
1655 fAnchorStart = nativeStart;
1656 fAnchorLimit = nativeLimit;
1657 }
1658 return *this;
1659 }
1660
region(int64_t start,int64_t limit,UErrorCode & status)1661 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1662 return region(start, limit, -1, status);
1663 }
1664
1665 //--------------------------------------------------------------------------------
1666 //
1667 // regionEnd
1668 //
1669 //--------------------------------------------------------------------------------
regionEnd() const1670 int32_t RegexMatcher::regionEnd() const {
1671 return (int32_t)fRegionLimit;
1672 }
1673
regionEnd64() const1674 int64_t RegexMatcher::regionEnd64() const {
1675 return fRegionLimit;
1676 }
1677
1678 //--------------------------------------------------------------------------------
1679 //
1680 // regionStart
1681 //
1682 //--------------------------------------------------------------------------------
regionStart() const1683 int32_t RegexMatcher::regionStart() const {
1684 return (int32_t)fRegionStart;
1685 }
1686
regionStart64() const1687 int64_t RegexMatcher::regionStart64() const {
1688 return fRegionStart;
1689 }
1690
1691
1692 //--------------------------------------------------------------------------------
1693 //
1694 // replaceAll
1695 //
1696 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1697 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1698 UText replacementText = UTEXT_INITIALIZER;
1699 UText resultText = UTEXT_INITIALIZER;
1700 UnicodeString resultString;
1701 if (U_FAILURE(status)) {
1702 return resultString;
1703 }
1704
1705 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1706 utext_openUnicodeString(&resultText, &resultString, &status);
1707
1708 replaceAll(&replacementText, &resultText, status);
1709
1710 utext_close(&resultText);
1711 utext_close(&replacementText);
1712
1713 return resultString;
1714 }
1715
1716
1717 //
1718 // replaceAll, UText mode
1719 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1720 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1721 if (U_FAILURE(status)) {
1722 return dest;
1723 }
1724 if (U_FAILURE(fDeferredStatus)) {
1725 status = fDeferredStatus;
1726 return dest;
1727 }
1728
1729 if (dest == NULL) {
1730 UnicodeString emptyString;
1731 UText empty = UTEXT_INITIALIZER;
1732
1733 utext_openUnicodeString(&empty, &emptyString, &status);
1734 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1735 utext_close(&empty);
1736 }
1737
1738 if (U_SUCCESS(status)) {
1739 reset();
1740 while (find()) {
1741 appendReplacement(dest, replacement, status);
1742 if (U_FAILURE(status)) {
1743 break;
1744 }
1745 }
1746 appendTail(dest, status);
1747 }
1748
1749 return dest;
1750 }
1751
1752
1753 //--------------------------------------------------------------------------------
1754 //
1755 // replaceFirst
1756 //
1757 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1758 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1759 UText replacementText = UTEXT_INITIALIZER;
1760 UText resultText = UTEXT_INITIALIZER;
1761 UnicodeString resultString;
1762
1763 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1764 utext_openUnicodeString(&resultText, &resultString, &status);
1765
1766 replaceFirst(&replacementText, &resultText, status);
1767
1768 utext_close(&resultText);
1769 utext_close(&replacementText);
1770
1771 return resultString;
1772 }
1773
1774 //
1775 // replaceFirst, UText mode
1776 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1777 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1778 if (U_FAILURE(status)) {
1779 return dest;
1780 }
1781 if (U_FAILURE(fDeferredStatus)) {
1782 status = fDeferredStatus;
1783 return dest;
1784 }
1785
1786 reset();
1787 if (!find()) {
1788 return getInput(dest, status);
1789 }
1790
1791 if (dest == NULL) {
1792 UnicodeString emptyString;
1793 UText empty = UTEXT_INITIALIZER;
1794
1795 utext_openUnicodeString(&empty, &emptyString, &status);
1796 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1797 utext_close(&empty);
1798 }
1799
1800 appendReplacement(dest, replacement, status);
1801 appendTail(dest, status);
1802
1803 return dest;
1804 }
1805
1806
1807 //--------------------------------------------------------------------------------
1808 //
1809 // requireEnd
1810 //
1811 //--------------------------------------------------------------------------------
requireEnd() const1812 UBool RegexMatcher::requireEnd() const {
1813 return fRequireEnd;
1814 }
1815
1816
1817 //--------------------------------------------------------------------------------
1818 //
1819 // reset
1820 //
1821 //--------------------------------------------------------------------------------
reset()1822 RegexMatcher &RegexMatcher::reset() {
1823 fRegionStart = 0;
1824 fRegionLimit = fInputLength;
1825 fActiveStart = 0;
1826 fActiveLimit = fInputLength;
1827 fAnchorStart = 0;
1828 fAnchorLimit = fInputLength;
1829 fLookStart = 0;
1830 fLookLimit = fInputLength;
1831 resetPreserveRegion();
1832 return *this;
1833 }
1834
1835
1836
resetPreserveRegion()1837 void RegexMatcher::resetPreserveRegion() {
1838 fMatchStart = 0;
1839 fMatchEnd = 0;
1840 fLastMatchEnd = -1;
1841 fAppendPosition = 0;
1842 fMatch = FALSE;
1843 fHitEnd = FALSE;
1844 fRequireEnd = FALSE;
1845 fTime = 0;
1846 fTickCounter = TIMER_INITIAL_VALUE;
1847 //resetStack(); // more expensive than it looks...
1848 }
1849
1850
reset(const UnicodeString & input)1851 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1852 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1853 if (fPattern->fNeedsAltInput) {
1854 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1855 }
1856 if (U_FAILURE(fDeferredStatus)) {
1857 return *this;
1858 }
1859 fInputLength = utext_nativeLength(fInputText);
1860
1861 reset();
1862 delete fInput;
1863 fInput = NULL;
1864
1865 // Do the following for any UnicodeString.
1866 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1867 fInputUniStrMaybeMutable = TRUE;
1868
1869 #if UCONFIG_NO_BREAK_ITERATION==0
1870 if (fWordBreakItr) {
1871 fWordBreakItr->setText(fInputText, fDeferredStatus);
1872 }
1873 if (fGCBreakItr) {
1874 fGCBreakItr->setText(fInputText, fDeferredStatus);
1875 }
1876 #endif
1877
1878 return *this;
1879 }
1880
1881
reset(UText * input)1882 RegexMatcher &RegexMatcher::reset(UText *input) {
1883 if (fInputText != input) {
1884 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1885 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1886 if (U_FAILURE(fDeferredStatus)) {
1887 return *this;
1888 }
1889 fInputLength = utext_nativeLength(fInputText);
1890
1891 delete fInput;
1892 fInput = NULL;
1893
1894 #if UCONFIG_NO_BREAK_ITERATION==0
1895 if (fWordBreakItr) {
1896 fWordBreakItr->setText(input, fDeferredStatus);
1897 }
1898 if (fGCBreakItr) {
1899 fGCBreakItr->setText(fInputText, fDeferredStatus);
1900 }
1901 #endif
1902 }
1903 reset();
1904 fInputUniStrMaybeMutable = FALSE;
1905
1906 return *this;
1907 }
1908
1909 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1910 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1911 return *this;
1912 }*/
1913
reset(int64_t position,UErrorCode & status)1914 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1915 if (U_FAILURE(status)) {
1916 return *this;
1917 }
1918 reset(); // Reset also resets the region to be the entire string.
1919
1920 if (position < 0 || position > fActiveLimit) {
1921 status = U_INDEX_OUTOFBOUNDS_ERROR;
1922 return *this;
1923 }
1924 fMatchEnd = position;
1925 return *this;
1926 }
1927
1928
1929 //--------------------------------------------------------------------------------
1930 //
1931 // refresh
1932 //
1933 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1934 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1935 if (U_FAILURE(status)) {
1936 return *this;
1937 }
1938 if (input == NULL) {
1939 status = U_ILLEGAL_ARGUMENT_ERROR;
1940 return *this;
1941 }
1942 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1943 status = U_ILLEGAL_ARGUMENT_ERROR;
1944 return *this;
1945 }
1946 int64_t pos = utext_getNativeIndex(fInputText);
1947 // Shallow read-only clone of the new UText into the existing input UText
1948 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1949 if (U_FAILURE(status)) {
1950 return *this;
1951 }
1952 utext_setNativeIndex(fInputText, pos);
1953
1954 if (fAltInputText != NULL) {
1955 pos = utext_getNativeIndex(fAltInputText);
1956 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1957 if (U_FAILURE(status)) {
1958 return *this;
1959 }
1960 utext_setNativeIndex(fAltInputText, pos);
1961 }
1962 return *this;
1963 }
1964
1965
1966
1967 //--------------------------------------------------------------------------------
1968 //
1969 // setTrace
1970 //
1971 //--------------------------------------------------------------------------------
setTrace(UBool state)1972 void RegexMatcher::setTrace(UBool state) {
1973 fTraceDebug = state;
1974 }
1975
1976
1977
1978 /**
1979 * UText, replace entire contents of the destination UText with a substring of the source UText.
1980 *
1981 * @param src The source UText
1982 * @param dest The destination UText. Must be writable.
1983 * May be NULL, in which case a new UText will be allocated.
1984 * @param start Start index of source substring.
1985 * @param limit Limit index of source substring.
1986 * @param status An error code.
1987 */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1988 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1989 if (U_FAILURE(*status)) {
1990 return dest;
1991 }
1992 if (start == limit) {
1993 if (dest) {
1994 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1995 return dest;
1996 } else {
1997 return utext_openUChars(NULL, NULL, 0, status);
1998 }
1999 }
2000 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
2001 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
2002 return dest;
2003 }
2004 *status = U_ZERO_ERROR;
2005 MaybeStackArray<UChar, 40> buffer;
2006 if (length >= buffer.getCapacity()) {
2007 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
2008 if (newBuf == NULL) {
2009 *status = U_MEMORY_ALLOCATION_ERROR;
2010 }
2011 }
2012 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2013 if (dest) {
2014 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2015 return dest;
2016 }
2017
2018 // Caller did not provide a preexisting UText.
2019 // Open a new one, and have it adopt the text buffer storage.
2020 if (U_FAILURE(*status)) {
2021 return NULL;
2022 }
2023 int32_t ownedLength = 0;
2024 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2025 if (ownedBuf == NULL) {
2026 *status = U_MEMORY_ALLOCATION_ERROR;
2027 return NULL;
2028 }
2029 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2030 if (U_FAILURE(*status)) {
2031 uprv_free(ownedBuf);
2032 return NULL;
2033 }
2034 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2035 return result;
2036 }
2037
2038
2039 //---------------------------------------------------------------------
2040 //
2041 // split
2042 //
2043 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2044 int32_t RegexMatcher::split(const UnicodeString &input,
2045 UnicodeString dest[],
2046 int32_t destCapacity,
2047 UErrorCode &status)
2048 {
2049 UText inputText = UTEXT_INITIALIZER;
2050 utext_openConstUnicodeString(&inputText, &input, &status);
2051 if (U_FAILURE(status)) {
2052 return 0;
2053 }
2054
2055 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2056 if (destText == NULL) {
2057 status = U_MEMORY_ALLOCATION_ERROR;
2058 return 0;
2059 }
2060 int32_t i;
2061 for (i = 0; i < destCapacity; i++) {
2062 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2063 }
2064
2065 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2066
2067 for (i = 0; i < destCapacity; i++) {
2068 utext_close(destText[i]);
2069 }
2070
2071 uprv_free(destText);
2072 utext_close(&inputText);
2073 return fieldCount;
2074 }
2075
2076 //
2077 // split, UText mode
2078 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2079 int32_t RegexMatcher::split(UText *input,
2080 UText *dest[],
2081 int32_t destCapacity,
2082 UErrorCode &status)
2083 {
2084 //
2085 // Check arguments for validity
2086 //
2087 if (U_FAILURE(status)) {
2088 return 0;
2089 }
2090
2091 if (destCapacity < 1) {
2092 status = U_ILLEGAL_ARGUMENT_ERROR;
2093 return 0;
2094 }
2095
2096 //
2097 // Reset for the input text
2098 //
2099 reset(input);
2100 int64_t nextOutputStringStart = 0;
2101 if (fActiveLimit == 0) {
2102 return 0;
2103 }
2104
2105 //
2106 // Loop through the input text, searching for the delimiter pattern
2107 //
2108 int32_t i;
2109 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2110 for (i=0; ; i++) {
2111 if (i>=destCapacity-1) {
2112 // There is one or zero output string left.
2113 // Fill the last output string with whatever is left from the input, then exit the loop.
2114 // ( i will be == destCapacity if we filled the output array while processing
2115 // capture groups of the delimiter expression, in which case we will discard the
2116 // last capture group saved in favor of the unprocessed remainder of the
2117 // input string.)
2118 i = destCapacity-1;
2119 if (fActiveLimit > nextOutputStringStart) {
2120 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2121 if (dest[i]) {
2122 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2123 input->chunkContents+nextOutputStringStart,
2124 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2125 } else {
2126 UText remainingText = UTEXT_INITIALIZER;
2127 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2128 fActiveLimit-nextOutputStringStart, &status);
2129 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2130 utext_close(&remainingText);
2131 }
2132 } else {
2133 UErrorCode lengthStatus = U_ZERO_ERROR;
2134 int32_t remaining16Length =
2135 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2136 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2137 if (remainingChars == NULL) {
2138 status = U_MEMORY_ALLOCATION_ERROR;
2139 break;
2140 }
2141
2142 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2143 if (dest[i]) {
2144 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2145 } else {
2146 UText remainingText = UTEXT_INITIALIZER;
2147 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2148 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2149 utext_close(&remainingText);
2150 }
2151
2152 uprv_free(remainingChars);
2153 }
2154 }
2155 break;
2156 }
2157 if (find()) {
2158 // We found another delimiter. Move everything from where we started looking
2159 // up until the start of the delimiter into the next output string.
2160 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2161 if (dest[i]) {
2162 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2163 input->chunkContents+nextOutputStringStart,
2164 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2165 } else {
2166 UText remainingText = UTEXT_INITIALIZER;
2167 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2168 fMatchStart-nextOutputStringStart, &status);
2169 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2170 utext_close(&remainingText);
2171 }
2172 } else {
2173 UErrorCode lengthStatus = U_ZERO_ERROR;
2174 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2175 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2176 if (remainingChars == NULL) {
2177 status = U_MEMORY_ALLOCATION_ERROR;
2178 break;
2179 }
2180 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2181 if (dest[i]) {
2182 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2183 } else {
2184 UText remainingText = UTEXT_INITIALIZER;
2185 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2186 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2187 utext_close(&remainingText);
2188 }
2189
2190 uprv_free(remainingChars);
2191 }
2192 nextOutputStringStart = fMatchEnd;
2193
2194 // If the delimiter pattern has capturing parentheses, the captured
2195 // text goes out into the next n destination strings.
2196 int32_t groupNum;
2197 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2198 if (i >= destCapacity-2) {
2199 // Never fill the last available output string with capture group text.
2200 // It will filled with the last field, the remainder of the
2201 // unsplit input text.
2202 break;
2203 }
2204 i++;
2205 dest[i] = utext_extract_replace(fInputText, dest[i],
2206 start64(groupNum, status), end64(groupNum, status), &status);
2207 }
2208
2209 if (nextOutputStringStart == fActiveLimit) {
2210 // The delimiter was at the end of the string. We're done, but first
2211 // we output one last empty string, for the empty field following
2212 // the delimiter at the end of input.
2213 if (i+1 < destCapacity) {
2214 ++i;
2215 if (dest[i] == NULL) {
2216 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2217 } else {
2218 static const UChar emptyString[] = {(UChar)0};
2219 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2220 }
2221 }
2222 break;
2223
2224 }
2225 }
2226 else
2227 {
2228 // We ran off the end of the input while looking for the next delimiter.
2229 // All the remaining text goes into the current output string.
2230 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2231 if (dest[i]) {
2232 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2233 input->chunkContents+nextOutputStringStart,
2234 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2235 } else {
2236 UText remainingText = UTEXT_INITIALIZER;
2237 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2238 fActiveLimit-nextOutputStringStart, &status);
2239 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2240 utext_close(&remainingText);
2241 }
2242 } else {
2243 UErrorCode lengthStatus = U_ZERO_ERROR;
2244 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2245 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2246 if (remainingChars == NULL) {
2247 status = U_MEMORY_ALLOCATION_ERROR;
2248 break;
2249 }
2250
2251 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2252 if (dest[i]) {
2253 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2254 } else {
2255 UText remainingText = UTEXT_INITIALIZER;
2256 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2257 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2258 utext_close(&remainingText);
2259 }
2260
2261 uprv_free(remainingChars);
2262 }
2263 break;
2264 }
2265 if (U_FAILURE(status)) {
2266 break;
2267 }
2268 } // end of for loop
2269 return i+1;
2270 }
2271
2272
2273 //--------------------------------------------------------------------------------
2274 //
2275 // start
2276 //
2277 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2278 int32_t RegexMatcher::start(UErrorCode &status) const {
2279 return start(0, status);
2280 }
2281
start64(UErrorCode & status) const2282 int64_t RegexMatcher::start64(UErrorCode &status) const {
2283 return start64(0, status);
2284 }
2285
2286 //--------------------------------------------------------------------------------
2287 //
2288 // start(int32_t group, UErrorCode &status)
2289 //
2290 //--------------------------------------------------------------------------------
2291
start64(int32_t group,UErrorCode & status) const2292 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2293 if (U_FAILURE(status)) {
2294 return -1;
2295 }
2296 if (U_FAILURE(fDeferredStatus)) {
2297 status = fDeferredStatus;
2298 return -1;
2299 }
2300 if (fMatch == FALSE) {
2301 status = U_REGEX_INVALID_STATE;
2302 return -1;
2303 }
2304 if (group < 0 || group > fPattern->fGroupMap->size()) {
2305 status = U_INDEX_OUTOFBOUNDS_ERROR;
2306 return -1;
2307 }
2308 int64_t s;
2309 if (group == 0) {
2310 s = fMatchStart;
2311 } else {
2312 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2313 U_ASSERT(groupOffset < fPattern->fFrameSize);
2314 U_ASSERT(groupOffset >= 0);
2315 s = fFrame->fExtra[groupOffset];
2316 }
2317
2318 return s;
2319 }
2320
2321
start(int32_t group,UErrorCode & status) const2322 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2323 return (int32_t)start64(group, status);
2324 }
2325
2326 //--------------------------------------------------------------------------------
2327 //
2328 // useAnchoringBounds
2329 //
2330 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2331 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2332 fAnchoringBounds = b;
2333 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2334 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2335 return *this;
2336 }
2337
2338
2339 //--------------------------------------------------------------------------------
2340 //
2341 // useTransparentBounds
2342 //
2343 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2344 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2345 fTransparentBounds = b;
2346 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2347 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2348 return *this;
2349 }
2350
2351 //--------------------------------------------------------------------------------
2352 //
2353 // setTimeLimit
2354 //
2355 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2356 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2357 if (U_FAILURE(status)) {
2358 return;
2359 }
2360 if (U_FAILURE(fDeferredStatus)) {
2361 status = fDeferredStatus;
2362 return;
2363 }
2364 if (limit < 0) {
2365 status = U_ILLEGAL_ARGUMENT_ERROR;
2366 return;
2367 }
2368 fTimeLimit = limit;
2369 }
2370
2371
2372 //--------------------------------------------------------------------------------
2373 //
2374 // getTimeLimit
2375 //
2376 //--------------------------------------------------------------------------------
getTimeLimit() const2377 int32_t RegexMatcher::getTimeLimit() const {
2378 return fTimeLimit;
2379 }
2380
2381
2382 //--------------------------------------------------------------------------------
2383 //
2384 // setStackLimit
2385 //
2386 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2387 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2388 if (U_FAILURE(status)) {
2389 return;
2390 }
2391 if (U_FAILURE(fDeferredStatus)) {
2392 status = fDeferredStatus;
2393 return;
2394 }
2395 if (limit < 0) {
2396 status = U_ILLEGAL_ARGUMENT_ERROR;
2397 return;
2398 }
2399
2400 // Reset the matcher. This is needed here in case there is a current match
2401 // whose final stack frame (containing the match results, pointed to by fFrame)
2402 // would be lost by resizing to a smaller stack size.
2403 reset();
2404
2405 if (limit == 0) {
2406 // Unlimited stack expansion
2407 fStack->setMaxCapacity(0);
2408 } else {
2409 // Change the units of the limit from bytes to ints, and bump the size up
2410 // to be big enough to hold at least one stack frame for the pattern,
2411 // if it isn't there already.
2412 int32_t adjustedLimit = limit / sizeof(int32_t);
2413 if (adjustedLimit < fPattern->fFrameSize) {
2414 adjustedLimit = fPattern->fFrameSize;
2415 }
2416 fStack->setMaxCapacity(adjustedLimit);
2417 }
2418 fStackLimit = limit;
2419 }
2420
2421
2422 //--------------------------------------------------------------------------------
2423 //
2424 // getStackLimit
2425 //
2426 //--------------------------------------------------------------------------------
getStackLimit() const2427 int32_t RegexMatcher::getStackLimit() const {
2428 return fStackLimit;
2429 }
2430
2431
2432 //--------------------------------------------------------------------------------
2433 //
2434 // setMatchCallback
2435 //
2436 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2437 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2438 const void *context,
2439 UErrorCode &status) {
2440 if (U_FAILURE(status)) {
2441 return;
2442 }
2443 fCallbackFn = callback;
2444 fCallbackContext = context;
2445 }
2446
2447
2448 //--------------------------------------------------------------------------------
2449 //
2450 // getMatchCallback
2451 //
2452 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2453 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2454 const void *&context,
2455 UErrorCode &status) {
2456 if (U_FAILURE(status)) {
2457 return;
2458 }
2459 callback = fCallbackFn;
2460 context = fCallbackContext;
2461 }
2462
2463
2464 //--------------------------------------------------------------------------------
2465 //
2466 // setMatchCallback
2467 //
2468 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2469 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2470 const void *context,
2471 UErrorCode &status) {
2472 if (U_FAILURE(status)) {
2473 return;
2474 }
2475 fFindProgressCallbackFn = callback;
2476 fFindProgressCallbackContext = context;
2477 }
2478
2479
2480 //--------------------------------------------------------------------------------
2481 //
2482 // getMatchCallback
2483 //
2484 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2485 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2486 const void *&context,
2487 UErrorCode &status) {
2488 if (U_FAILURE(status)) {
2489 return;
2490 }
2491 callback = fFindProgressCallbackFn;
2492 context = fFindProgressCallbackContext;
2493 }
2494
2495
2496 //================================================================================
2497 //
2498 // Code following this point in this file is the internal
2499 // Match Engine Implementation.
2500 //
2501 //================================================================================
2502
2503
2504 //--------------------------------------------------------------------------------
2505 //
2506 // resetStack
2507 // Discard any previous contents of the state save stack, and initialize a
2508 // new stack frame to all -1. The -1s are needed for capture group limits,
2509 // where they indicate that a group has not yet matched anything.
2510 //--------------------------------------------------------------------------------
resetStack()2511 REStackFrame *RegexMatcher::resetStack() {
2512 // Discard any previous contents of the state save stack, and initialize a
2513 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2514 // where they indicate that a group has not yet matched anything.
2515 fStack->removeAllElements();
2516
2517 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2518 if(U_FAILURE(fDeferredStatus)) {
2519 return NULL;
2520 }
2521
2522 int32_t i;
2523 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2524 iFrame->fExtra[i] = -1;
2525 }
2526 return iFrame;
2527 }
2528
2529
2530
2531 //--------------------------------------------------------------------------------
2532 //
2533 // isWordBoundary
2534 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2535 // For us,
2536 // If the current char is a combining mark,
2537 // \b is FALSE.
2538 // Else Scan backwards to the first non-combining char.
2539 // We are at a boundary if the this char and the original chars are
2540 // opposite in membership in \w set
2541 //
2542 // parameters: pos - the current position in the input buffer
2543 //
2544 // TODO: double-check edge cases at region boundaries.
2545 //
2546 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2547 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2548 UBool isBoundary = FALSE;
2549 UBool cIsWord = FALSE;
2550
2551 if (pos >= fLookLimit) {
2552 fHitEnd = TRUE;
2553 } else {
2554 // Determine whether char c at current position is a member of the word set of chars.
2555 // If we're off the end of the string, behave as though we're not at a word char.
2556 UTEXT_SETNATIVEINDEX(fInputText, pos);
2557 UChar32 c = UTEXT_CURRENT32(fInputText);
2558 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2559 // Current char is a combining one. Not a boundary.
2560 return FALSE;
2561 }
2562 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2563 }
2564
2565 // Back up until we come to a non-combining char, determine whether
2566 // that char is a word char.
2567 UBool prevCIsWord = FALSE;
2568 for (;;) {
2569 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2570 break;
2571 }
2572 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2573 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2574 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2575 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2576 break;
2577 }
2578 }
2579 isBoundary = cIsWord ^ prevCIsWord;
2580 return isBoundary;
2581 }
2582
isChunkWordBoundary(int32_t pos)2583 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2584 UBool isBoundary = FALSE;
2585 UBool cIsWord = FALSE;
2586
2587 const UChar *inputBuf = fInputText->chunkContents;
2588
2589 if (pos >= fLookLimit) {
2590 fHitEnd = TRUE;
2591 } else {
2592 // Determine whether char c at current position is a member of the word set of chars.
2593 // If we're off the end of the string, behave as though we're not at a word char.
2594 UChar32 c;
2595 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2596 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2597 // Current char is a combining one. Not a boundary.
2598 return FALSE;
2599 }
2600 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2601 }
2602
2603 // Back up until we come to a non-combining char, determine whether
2604 // that char is a word char.
2605 UBool prevCIsWord = FALSE;
2606 for (;;) {
2607 if (pos <= fLookStart) {
2608 break;
2609 }
2610 UChar32 prevChar;
2611 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2612 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2613 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2614 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2615 break;
2616 }
2617 }
2618 isBoundary = cIsWord ^ prevCIsWord;
2619 return isBoundary;
2620 }
2621
2622 //--------------------------------------------------------------------------------
2623 //
2624 // isUWordBoundary
2625 //
2626 // Test for a word boundary using RBBI word break.
2627 //
2628 // parameters: pos - the current position in the input buffer
2629 //
2630 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos,UErrorCode & status)2631 UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
2632 UBool returnVal = FALSE;
2633
2634 #if UCONFIG_NO_BREAK_ITERATION==0
2635 // Note: this point will never be reached if break iteration is configured out.
2636 // Regex patterns that would require this function will fail to compile.
2637
2638 // If we haven't yet created a break iterator for this matcher, do it now.
2639 if (fWordBreakItr == nullptr) {
2640 fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
2641 if (U_FAILURE(status)) {
2642 return FALSE;
2643 }
2644 fWordBreakItr->setText(fInputText, status);
2645 }
2646
2647 // Note: zero width boundary tests like \b see through transparent region bounds,
2648 // which is why fLookLimit is used here, rather than fActiveLimit.
2649 if (pos >= fLookLimit) {
2650 fHitEnd = TRUE;
2651 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2652 // words are not boundaries. All non-word chars stand by themselves,
2653 // with word boundaries on both sides.
2654 } else {
2655 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2656 }
2657 #endif
2658 return returnVal;
2659 }
2660
2661
followingGCBoundary(int64_t pos,UErrorCode & status)2662 int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
2663 int64_t result = pos;
2664
2665 #if UCONFIG_NO_BREAK_ITERATION==0
2666 // Note: this point will never be reached if break iteration is configured out.
2667 // Regex patterns that would require this function will fail to compile.
2668
2669 // If we haven't yet created a break iterator for this matcher, do it now.
2670 if (fGCBreakItr == nullptr) {
2671 fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2672 if (U_FAILURE(status)) {
2673 return pos;
2674 }
2675 fGCBreakItr->setText(fInputText, status);
2676 }
2677 result = fGCBreakItr->following(pos);
2678 if (result == BreakIterator::DONE) {
2679 result = pos;
2680 }
2681 #endif
2682 return result;
2683 }
2684
2685 //--------------------------------------------------------------------------------
2686 //
2687 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2688 // saves. Increment the "time" counter, and call the
2689 // user callback function if there is one installed.
2690 //
2691 // If the match operation needs to be aborted, either for a time-out
2692 // or because the user callback asked for it, just set an error status.
2693 // The engine will pick that up and stop in its outer loop.
2694 //
2695 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2696 void RegexMatcher::IncrementTime(UErrorCode &status) {
2697 fTickCounter = TIMER_INITIAL_VALUE;
2698 fTime++;
2699 if (fCallbackFn != NULL) {
2700 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2701 status = U_REGEX_STOPPED_BY_CALLER;
2702 return;
2703 }
2704 }
2705 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2706 status = U_REGEX_TIME_OUT;
2707 }
2708 }
2709
2710 //--------------------------------------------------------------------------------
2711 //
2712 // StateSave
2713 // Make a new stack frame, initialized as a copy of the current stack frame.
2714 // Set the pattern index in the original stack frame from the operand value
2715 // in the opcode. Execution of the engine continues with the state in
2716 // the newly created stack frame
2717 //
2718 // Note that reserveBlock() may grow the stack, resulting in the
2719 // whole thing being relocated in memory.
2720 //
2721 // Parameters:
2722 // fp The top frame pointer when called. At return, a new
2723 // fame will be present
2724 // savePatIdx An index into the compiled pattern. Goes into the original
2725 // (not new) frame. If execution ever back-tracks out of the
2726 // new frame, this will be where we continue from in the pattern.
2727 // Return
2728 // The new frame pointer.
2729 //
2730 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2731 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2732 if (U_FAILURE(status)) {
2733 return fp;
2734 }
2735 // push storage for a new frame.
2736 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2737 if (U_FAILURE(status)) {
2738 // Failure on attempted stack expansion.
2739 // Stack function set some other error code, change it to a more
2740 // specific one for regular expressions.
2741 status = U_REGEX_STACK_OVERFLOW;
2742 // We need to return a writable stack frame, so just return the
2743 // previous frame. The match operation will stop quickly
2744 // because of the error status, after which the frame will never
2745 // be looked at again.
2746 return fp;
2747 }
2748 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2749
2750 // New stack frame = copy of old top frame.
2751 int64_t *source = (int64_t *)fp;
2752 int64_t *dest = newFP;
2753 for (;;) {
2754 *dest++ = *source++;
2755 if (source == newFP) {
2756 break;
2757 }
2758 }
2759
2760 fTickCounter--;
2761 if (fTickCounter <= 0) {
2762 IncrementTime(status); // Re-initializes fTickCounter
2763 }
2764 fp->fPatIdx = savePatIdx;
2765 return (REStackFrame *)newFP;
2766 }
2767
2768 #if defined(REGEX_DEBUG)
2769 namespace {
StringFromUText(UText * ut)2770 UnicodeString StringFromUText(UText *ut) {
2771 UnicodeString result;
2772 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2773 result.append(c);
2774 }
2775 return result;
2776 }
2777 }
2778 #endif // REGEX_DEBUG
2779
2780
2781 //--------------------------------------------------------------------------------
2782 //
2783 // MatchAt This is the actual matching engine.
2784 //
2785 // startIdx: begin matching a this index.
2786 // toEnd: if true, match must extend to end of the input region
2787 //
2788 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2789 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2790 UBool isMatch = FALSE; // True if the we have a match.
2791
2792 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2793
2794 int32_t op; // Operation from the compiled pattern, split into
2795 int32_t opType; // the opcode
2796 int32_t opValue; // and the operand value.
2797
2798 #ifdef REGEX_RUN_DEBUG
2799 if (fTraceDebug) {
2800 printf("MatchAt(startIdx=%ld)\n", startIdx);
2801 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2802 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2803 }
2804 #endif
2805
2806 if (U_FAILURE(status)) {
2807 return;
2808 }
2809
2810 // Cache frequently referenced items from the compiled pattern
2811 //
2812 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2813
2814 const UChar *litText = fPattern->fLiteralText.getBuffer();
2815 UVector *fSets = fPattern->fSets;
2816
2817 fFrameSize = fPattern->fFrameSize;
2818 REStackFrame *fp = resetStack();
2819 if (U_FAILURE(fDeferredStatus)) {
2820 status = fDeferredStatus;
2821 return;
2822 }
2823
2824 fp->fPatIdx = 0;
2825 fp->fInputIdx = startIdx;
2826
2827 // Zero out the pattern's static data
2828 int32_t i;
2829 for (i = 0; i<fPattern->fDataSize; i++) {
2830 fData[i] = 0;
2831 }
2832
2833 //
2834 // Main loop for interpreting the compiled pattern.
2835 // One iteration of the loop per pattern operation performed.
2836 //
2837 for (;;) {
2838 op = (int32_t)pat[fp->fPatIdx];
2839 opType = URX_TYPE(op);
2840 opValue = URX_VAL(op);
2841 #ifdef REGEX_RUN_DEBUG
2842 if (fTraceDebug) {
2843 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2844 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2845 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2846 fPattern->dumpOp(fp->fPatIdx);
2847 }
2848 #endif
2849 fp->fPatIdx++;
2850
2851 switch (opType) {
2852
2853
2854 case URX_NOP:
2855 break;
2856
2857
2858 case URX_BACKTRACK:
2859 // Force a backtrack. In some circumstances, the pattern compiler
2860 // will notice that the pattern can't possibly match anything, and will
2861 // emit one of these at that point.
2862 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2863 break;
2864
2865
2866 case URX_ONECHAR:
2867 if (fp->fInputIdx < fActiveLimit) {
2868 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2869 UChar32 c = UTEXT_NEXT32(fInputText);
2870 if (c == opValue) {
2871 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2872 break;
2873 }
2874 } else {
2875 fHitEnd = TRUE;
2876 }
2877 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2878 break;
2879
2880
2881 case URX_STRING:
2882 {
2883 // Test input against a literal string.
2884 // Strings require two slots in the compiled pattern, one for the
2885 // offset to the string text, and one for the length.
2886
2887 int32_t stringStartIdx = opValue;
2888 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2889 fp->fPatIdx++;
2890 opType = URX_TYPE(op);
2891 int32_t stringLen = URX_VAL(op);
2892 U_ASSERT(opType == URX_STRING_LEN);
2893 U_ASSERT(stringLen >= 2);
2894
2895 const UChar *patternString = litText+stringStartIdx;
2896 int32_t patternStringIndex = 0;
2897 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2898 UChar32 inputChar;
2899 UChar32 patternChar;
2900 UBool success = TRUE;
2901 while (patternStringIndex < stringLen) {
2902 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2903 success = FALSE;
2904 fHitEnd = TRUE;
2905 break;
2906 }
2907 inputChar = UTEXT_NEXT32(fInputText);
2908 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2909 if (patternChar != inputChar) {
2910 success = FALSE;
2911 break;
2912 }
2913 }
2914
2915 if (success) {
2916 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2917 } else {
2918 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2919 }
2920 }
2921 break;
2922
2923
2924 case URX_STATE_SAVE:
2925 fp = StateSave(fp, opValue, status);
2926 break;
2927
2928
2929 case URX_END:
2930 // The match loop will exit via this path on a successful match,
2931 // when we reach the end of the pattern.
2932 if (toEnd && fp->fInputIdx != fActiveLimit) {
2933 // The pattern matched, but not to the end of input. Try some more.
2934 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2935 break;
2936 }
2937 isMatch = TRUE;
2938 goto breakFromLoop;
2939
2940 // Start and End Capture stack frame variables are laid out out like this:
2941 // fp->fExtra[opValue] - The start of a completed capture group
2942 // opValue+1 - The end of a completed capture group
2943 // opValue+2 - the start of a capture group whose end
2944 // has not yet been reached (and might not ever be).
2945 case URX_START_CAPTURE:
2946 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2947 fp->fExtra[opValue+2] = fp->fInputIdx;
2948 break;
2949
2950
2951 case URX_END_CAPTURE:
2952 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2953 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2954 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2955 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2956 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2957 break;
2958
2959
2960 case URX_DOLLAR: // $, test for End of line
2961 // or for position before new line at end of input
2962 {
2963 if (fp->fInputIdx >= fAnchorLimit) {
2964 // We really are at the end of input. Success.
2965 fHitEnd = TRUE;
2966 fRequireEnd = TRUE;
2967 break;
2968 }
2969
2970 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2971
2972 // If we are positioned just before a new-line that is located at the
2973 // end of input, succeed.
2974 UChar32 c = UTEXT_NEXT32(fInputText);
2975 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2976 if (isLineTerminator(c)) {
2977 // If not in the middle of a CR/LF sequence
2978 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2979 // At new-line at end of input. Success
2980 fHitEnd = TRUE;
2981 fRequireEnd = TRUE;
2982
2983 break;
2984 }
2985 }
2986 } else {
2987 UChar32 nextC = UTEXT_NEXT32(fInputText);
2988 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2989 fHitEnd = TRUE;
2990 fRequireEnd = TRUE;
2991 break; // At CR/LF at end of input. Success
2992 }
2993 }
2994
2995 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2996 }
2997 break;
2998
2999
3000 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
3001 if (fp->fInputIdx >= fAnchorLimit) {
3002 // Off the end of input. Success.
3003 fHitEnd = TRUE;
3004 fRequireEnd = TRUE;
3005 break;
3006 } else {
3007 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3008 UChar32 c = UTEXT_NEXT32(fInputText);
3009 // Either at the last character of input, or off the end.
3010 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3011 fHitEnd = TRUE;
3012 fRequireEnd = TRUE;
3013 break;
3014 }
3015 }
3016
3017 // Not at end of input. Back-track out.
3018 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3019 break;
3020
3021
3022 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
3023 {
3024 if (fp->fInputIdx >= fAnchorLimit) {
3025 // We really are at the end of input. Success.
3026 fHitEnd = TRUE;
3027 fRequireEnd = TRUE;
3028 break;
3029 }
3030 // If we are positioned just before a new-line, succeed.
3031 // It makes no difference where the new-line is within the input.
3032 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3033 UChar32 c = UTEXT_CURRENT32(fInputText);
3034 if (isLineTerminator(c)) {
3035 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3036 // In multi-line mode, hitting a new-line just before the end of input does not
3037 // set the hitEnd or requireEnd flags
3038 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3039 break;
3040 }
3041 }
3042 // not at a new line. Fail.
3043 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3044 }
3045 break;
3046
3047
3048 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3049 {
3050 if (fp->fInputIdx >= fAnchorLimit) {
3051 // We really are at the end of input. Success.
3052 fHitEnd = TRUE;
3053 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3054 break; // adding a new-line would not lose the match.
3055 }
3056 // If we are not positioned just before a new-line, the test fails; backtrack out.
3057 // It makes no difference where the new-line is within the input.
3058 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3059 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3060 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3061 }
3062 }
3063 break;
3064
3065
3066 case URX_CARET: // ^, test for start of line
3067 if (fp->fInputIdx != fAnchorStart) {
3068 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3069 }
3070 break;
3071
3072
3073 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3074 {
3075 if (fp->fInputIdx == fAnchorStart) {
3076 // We are at the start input. Success.
3077 break;
3078 }
3079 // Check whether character just before the current pos is a new-line
3080 // unless we are at the end of input
3081 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3082 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3083 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3084 // It's a new-line. ^ is true. Success.
3085 // TODO: what should be done with positions between a CR and LF?
3086 break;
3087 }
3088 // Not at the start of a line. Fail.
3089 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3090 }
3091 break;
3092
3093
3094 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3095 {
3096 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3097 if (fp->fInputIdx <= fAnchorStart) {
3098 // We are at the start input. Success.
3099 break;
3100 }
3101 // Check whether character just before the current pos is a new-line
3102 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3103 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3104 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3105 if (c != 0x0a) {
3106 // Not at the start of a line. Back-track out.
3107 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3108 }
3109 }
3110 break;
3111
3112 case URX_BACKSLASH_B: // Test for word boundaries
3113 {
3114 UBool success = isWordBoundary(fp->fInputIdx);
3115 success ^= (UBool)(opValue != 0); // flip sense for \B
3116 if (!success) {
3117 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3118 }
3119 }
3120 break;
3121
3122
3123 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3124 {
3125 UBool success = isUWordBoundary(fp->fInputIdx, status);
3126 success ^= (UBool)(opValue != 0); // flip sense for \B
3127 if (!success) {
3128 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129 }
3130 }
3131 break;
3132
3133
3134 case URX_BACKSLASH_D: // Test for decimal digit
3135 {
3136 if (fp->fInputIdx >= fActiveLimit) {
3137 fHitEnd = TRUE;
3138 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139 break;
3140 }
3141
3142 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3143
3144 UChar32 c = UTEXT_NEXT32(fInputText);
3145 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3146 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3147 success ^= (UBool)(opValue != 0); // flip sense for \D
3148 if (success) {
3149 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3150 } else {
3151 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3152 }
3153 }
3154 break;
3155
3156
3157 case URX_BACKSLASH_G: // Test for position at end of previous match
3158 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3159 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3160 }
3161 break;
3162
3163
3164 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3165 {
3166 if (fp->fInputIdx >= fActiveLimit) {
3167 fHitEnd = TRUE;
3168 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3169 break;
3170 }
3171 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3172 UChar32 c = UTEXT_NEXT32(fInputText);
3173 int8_t ctype = u_charType(c);
3174 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3175 success ^= (UBool)(opValue != 0); // flip sense for \H
3176 if (success) {
3177 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3178 } else {
3179 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3180 }
3181 }
3182 break;
3183
3184
3185 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3186 {
3187 if (fp->fInputIdx >= fActiveLimit) {
3188 fHitEnd = TRUE;
3189 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3190 break;
3191 }
3192 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3193 UChar32 c = UTEXT_NEXT32(fInputText);
3194 if (isLineTerminator(c)) {
3195 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3196 utext_next32(fInputText);
3197 }
3198 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199 } else {
3200 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201 }
3202 }
3203 break;
3204
3205
3206 case URX_BACKSLASH_V: // \v, any single line ending character.
3207 {
3208 if (fp->fInputIdx >= fActiveLimit) {
3209 fHitEnd = TRUE;
3210 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3211 break;
3212 }
3213 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3214 UChar32 c = UTEXT_NEXT32(fInputText);
3215 UBool success = isLineTerminator(c);
3216 success ^= (UBool)(opValue != 0); // flip sense for \V
3217 if (success) {
3218 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3219 } else {
3220 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3221 }
3222 }
3223 break;
3224
3225
3226 case URX_BACKSLASH_X:
3227 // Match a Grapheme, as defined by Unicode UAX 29.
3228
3229 // Fail if at end of input
3230 if (fp->fInputIdx >= fActiveLimit) {
3231 fHitEnd = TRUE;
3232 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3233 break;
3234 }
3235
3236 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
3237 if (fp->fInputIdx >= fActiveLimit) {
3238 fHitEnd = TRUE;
3239 fp->fInputIdx = fActiveLimit;
3240 }
3241 break;
3242
3243
3244 case URX_BACKSLASH_Z: // Test for end of Input
3245 if (fp->fInputIdx < fAnchorLimit) {
3246 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3247 } else {
3248 fHitEnd = TRUE;
3249 fRequireEnd = TRUE;
3250 }
3251 break;
3252
3253
3254
3255 case URX_STATIC_SETREF:
3256 {
3257 // Test input character against one of the predefined sets
3258 // (Word Characters, for example)
3259 // The high bit of the op value is a flag for the match polarity.
3260 // 0: success if input char is in set.
3261 // 1: success if input char is not in set.
3262 if (fp->fInputIdx >= fActiveLimit) {
3263 fHitEnd = TRUE;
3264 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3265 break;
3266 }
3267
3268 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3269 opValue &= ~URX_NEG_SET;
3270 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3271
3272 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3273 UChar32 c = UTEXT_NEXT32(fInputText);
3274 if (c < 256) {
3275 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3276 if (s8.contains(c)) {
3277 success = !success;
3278 }
3279 } else {
3280 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3281 if (s.contains(c)) {
3282 success = !success;
3283 }
3284 }
3285 if (success) {
3286 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3287 } else {
3288 // the character wasn't in the set.
3289 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3290 }
3291 }
3292 break;
3293
3294
3295 case URX_STAT_SETREF_N:
3296 {
3297 // Test input character for NOT being a member of one of
3298 // the predefined sets (Word Characters, for example)
3299 if (fp->fInputIdx >= fActiveLimit) {
3300 fHitEnd = TRUE;
3301 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3302 break;
3303 }
3304
3305 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3306
3307 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3308
3309 UChar32 c = UTEXT_NEXT32(fInputText);
3310 if (c < 256) {
3311 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3312 if (s8.contains(c) == FALSE) {
3313 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3314 break;
3315 }
3316 } else {
3317 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3318 if (s.contains(c) == FALSE) {
3319 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3320 break;
3321 }
3322 }
3323 // the character wasn't in the set.
3324 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3325 }
3326 break;
3327
3328
3329 case URX_SETREF:
3330 if (fp->fInputIdx >= fActiveLimit) {
3331 fHitEnd = TRUE;
3332 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3333 break;
3334 } else {
3335 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3336
3337 // There is input left. Pick up one char and test it for set membership.
3338 UChar32 c = UTEXT_NEXT32(fInputText);
3339 U_ASSERT(opValue > 0 && opValue < fSets->size());
3340 if (c<256) {
3341 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3342 if (s8->contains(c)) {
3343 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3344 break;
3345 }
3346 } else {
3347 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3348 if (s->contains(c)) {
3349 // The character is in the set. A Match.
3350 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3351 break;
3352 }
3353 }
3354
3355 // the character wasn't in the set.
3356 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3357 }
3358 break;
3359
3360
3361 case URX_DOTANY:
3362 {
3363 // . matches anything, but stops at end-of-line.
3364 if (fp->fInputIdx >= fActiveLimit) {
3365 // At end of input. Match failed. Backtrack out.
3366 fHitEnd = TRUE;
3367 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3368 break;
3369 }
3370
3371 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3372
3373 // There is input left. Advance over one char, unless we've hit end-of-line
3374 UChar32 c = UTEXT_NEXT32(fInputText);
3375 if (isLineTerminator(c)) {
3376 // End of line in normal mode. . does not match.
3377 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3378 break;
3379 }
3380 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3381 }
3382 break;
3383
3384
3385 case URX_DOTANY_ALL:
3386 {
3387 // ., in dot-matches-all (including new lines) mode
3388 if (fp->fInputIdx >= fActiveLimit) {
3389 // At end of input. Match failed. Backtrack out.
3390 fHitEnd = TRUE;
3391 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3392 break;
3393 }
3394
3395 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3396
3397 // There is input left. Advance over one char, except if we are
3398 // at a cr/lf, advance over both of them.
3399 UChar32 c;
3400 c = UTEXT_NEXT32(fInputText);
3401 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3402 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3403 // In the case of a CR/LF, we need to advance over both.
3404 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3405 if (nextc == 0x0a) {
3406 (void)UTEXT_NEXT32(fInputText);
3407 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3408 }
3409 }
3410 }
3411 break;
3412
3413
3414 case URX_DOTANY_UNIX:
3415 {
3416 // '.' operator, matches all, but stops at end-of-line.
3417 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3418 if (fp->fInputIdx >= fActiveLimit) {
3419 // At end of input. Match failed. Backtrack out.
3420 fHitEnd = TRUE;
3421 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3422 break;
3423 }
3424
3425 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3426
3427 // There is input left. Advance over one char, unless we've hit end-of-line
3428 UChar32 c = UTEXT_NEXT32(fInputText);
3429 if (c == 0x0a) {
3430 // End of line in normal mode. '.' does not match the \n
3431 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3432 } else {
3433 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3434 }
3435 }
3436 break;
3437
3438
3439 case URX_JMP:
3440 fp->fPatIdx = opValue;
3441 break;
3442
3443 case URX_FAIL:
3444 isMatch = FALSE;
3445 goto breakFromLoop;
3446
3447 case URX_JMP_SAV:
3448 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3449 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3450 fp->fPatIdx = opValue; // Then JMP.
3451 break;
3452
3453 case URX_JMP_SAV_X:
3454 // This opcode is used with (x)+, when x can match a zero length string.
3455 // Same as JMP_SAV, except conditional on the match having made forward progress.
3456 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3457 // data address of the input position at the start of the loop.
3458 {
3459 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3460 int32_t stoOp = (int32_t)pat[opValue-1];
3461 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3462 int32_t frameLoc = URX_VAL(stoOp);
3463 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3464 int64_t prevInputIdx = fp->fExtra[frameLoc];
3465 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3466 if (prevInputIdx < fp->fInputIdx) {
3467 // The match did make progress. Repeat the loop.
3468 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3469 fp->fPatIdx = opValue;
3470 fp->fExtra[frameLoc] = fp->fInputIdx;
3471 }
3472 // If the input position did not advance, we do nothing here,
3473 // execution will fall out of the loop.
3474 }
3475 break;
3476
3477 case URX_CTR_INIT:
3478 {
3479 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3480 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3481
3482 // Pick up the three extra operands that CTR_INIT has, and
3483 // skip the pattern location counter past
3484 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3485 fp->fPatIdx += 3;
3486 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3487 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3488 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3489 U_ASSERT(minCount>=0);
3490 U_ASSERT(maxCount>=minCount || maxCount==-1);
3491 U_ASSERT(loopLoc>=fp->fPatIdx);
3492
3493 if (minCount == 0) {
3494 fp = StateSave(fp, loopLoc+1, status);
3495 }
3496 if (maxCount == -1) {
3497 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3498 } else if (maxCount == 0) {
3499 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3500 }
3501 }
3502 break;
3503
3504 case URX_CTR_LOOP:
3505 {
3506 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3507 int32_t initOp = (int32_t)pat[opValue];
3508 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3509 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3510 int32_t minCount = (int32_t)pat[opValue+2];
3511 int32_t maxCount = (int32_t)pat[opValue+3];
3512 (*pCounter)++;
3513 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3514 U_ASSERT(*pCounter == maxCount);
3515 break;
3516 }
3517 if (*pCounter >= minCount) {
3518 if (maxCount == -1) {
3519 // Loop has no hard upper bound.
3520 // Check that it is progressing through the input, break if it is not.
3521 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3522 if (fp->fInputIdx == *pLastInputIdx) {
3523 break;
3524 } else {
3525 *pLastInputIdx = fp->fInputIdx;
3526 }
3527 }
3528 fp = StateSave(fp, fp->fPatIdx, status);
3529 } else {
3530 // Increment time-out counter. (StateSave() does it if count >= minCount)
3531 fTickCounter--;
3532 if (fTickCounter <= 0) {
3533 IncrementTime(status); // Re-initializes fTickCounter
3534 }
3535 }
3536
3537 fp->fPatIdx = opValue + 4; // Loop back.
3538 }
3539 break;
3540
3541 case URX_CTR_INIT_NG:
3542 {
3543 // Initialize a non-greedy loop
3544 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3545 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3546
3547 // Pick up the three extra operands that CTR_INIT_NG has, and
3548 // skip the pattern location counter past
3549 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3550 fp->fPatIdx += 3;
3551 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3552 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3553 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3554 U_ASSERT(minCount>=0);
3555 U_ASSERT(maxCount>=minCount || maxCount==-1);
3556 U_ASSERT(loopLoc>fp->fPatIdx);
3557 if (maxCount == -1) {
3558 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3559 }
3560
3561 if (minCount == 0) {
3562 if (maxCount != 0) {
3563 fp = StateSave(fp, fp->fPatIdx, status);
3564 }
3565 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3566 }
3567 }
3568 break;
3569
3570 case URX_CTR_LOOP_NG:
3571 {
3572 // Non-greedy {min, max} loops
3573 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3574 int32_t initOp = (int32_t)pat[opValue];
3575 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3576 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3577 int32_t minCount = (int32_t)pat[opValue+2];
3578 int32_t maxCount = (int32_t)pat[opValue+3];
3579
3580 (*pCounter)++;
3581 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3582 // The loop has matched the maximum permitted number of times.
3583 // Break out of here with no action. Matching will
3584 // continue with the following pattern.
3585 U_ASSERT(*pCounter == maxCount);
3586 break;
3587 }
3588
3589 if (*pCounter < minCount) {
3590 // We haven't met the minimum number of matches yet.
3591 // Loop back for another one.
3592 fp->fPatIdx = opValue + 4; // Loop back.
3593 // Increment time-out counter. (StateSave() does it if count >= minCount)
3594 fTickCounter--;
3595 if (fTickCounter <= 0) {
3596 IncrementTime(status); // Re-initializes fTickCounter
3597 }
3598 } else {
3599 // We do have the minimum number of matches.
3600
3601 // If there is no upper bound on the loop iterations, check that the input index
3602 // is progressing, and stop the loop if it is not.
3603 if (maxCount == -1) {
3604 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3605 if (fp->fInputIdx == *pLastInputIdx) {
3606 break;
3607 }
3608 *pLastInputIdx = fp->fInputIdx;
3609 }
3610
3611 // Loop Continuation: we will fall into the pattern following the loop
3612 // (non-greedy, don't execute loop body first), but first do
3613 // a state save to the top of the loop, so that a match failure
3614 // in the following pattern will try another iteration of the loop.
3615 fp = StateSave(fp, opValue + 4, status);
3616 }
3617 }
3618 break;
3619
3620 case URX_STO_SP:
3621 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3622 fData[opValue] = fStack->size();
3623 break;
3624
3625 case URX_LD_SP:
3626 {
3627 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3628 int32_t newStackSize = (int32_t)fData[opValue];
3629 U_ASSERT(newStackSize <= fStack->size());
3630 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3631 if (newFP == (int64_t *)fp) {
3632 break;
3633 }
3634 int32_t j;
3635 for (j=0; j<fFrameSize; j++) {
3636 newFP[j] = ((int64_t *)fp)[j];
3637 }
3638 fp = (REStackFrame *)newFP;
3639 fStack->setSize(newStackSize);
3640 }
3641 break;
3642
3643 case URX_BACKREF:
3644 {
3645 U_ASSERT(opValue < fFrameSize);
3646 int64_t groupStartIdx = fp->fExtra[opValue];
3647 int64_t groupEndIdx = fp->fExtra[opValue+1];
3648 U_ASSERT(groupStartIdx <= groupEndIdx);
3649 if (groupStartIdx < 0) {
3650 // This capture group has not participated in the match thus far,
3651 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3652 break;
3653 }
3654 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3655 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3656
3657 // Note: if the capture group match was of an empty string the backref
3658 // match succeeds. Verified by testing: Perl matches succeed
3659 // in this case, so we do too.
3660
3661 UBool success = TRUE;
3662 for (;;) {
3663 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3664 success = TRUE;
3665 break;
3666 }
3667 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3668 success = FALSE;
3669 fHitEnd = TRUE;
3670 break;
3671 }
3672 UChar32 captureGroupChar = utext_next32(fAltInputText);
3673 UChar32 inputChar = utext_next32(fInputText);
3674 if (inputChar != captureGroupChar) {
3675 success = FALSE;
3676 break;
3677 }
3678 }
3679
3680 if (success) {
3681 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3682 } else {
3683 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3684 }
3685 }
3686 break;
3687
3688
3689
3690 case URX_BACKREF_I:
3691 {
3692 U_ASSERT(opValue < fFrameSize);
3693 int64_t groupStartIdx = fp->fExtra[opValue];
3694 int64_t groupEndIdx = fp->fExtra[opValue+1];
3695 U_ASSERT(groupStartIdx <= groupEndIdx);
3696 if (groupStartIdx < 0) {
3697 // This capture group has not participated in the match thus far,
3698 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3699 break;
3700 }
3701 utext_setNativeIndex(fAltInputText, groupStartIdx);
3702 utext_setNativeIndex(fInputText, fp->fInputIdx);
3703 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3704 CaseFoldingUTextIterator inputItr(*fInputText);
3705
3706 // Note: if the capture group match was of an empty string the backref
3707 // match succeeds. Verified by testing: Perl matches succeed
3708 // in this case, so we do too.
3709
3710 UBool success = TRUE;
3711 for (;;) {
3712 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3713 success = TRUE;
3714 break;
3715 }
3716 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3717 success = FALSE;
3718 fHitEnd = TRUE;
3719 break;
3720 }
3721 UChar32 captureGroupChar = captureGroupItr.next();
3722 UChar32 inputChar = inputItr.next();
3723 if (inputChar != captureGroupChar) {
3724 success = FALSE;
3725 break;
3726 }
3727 }
3728
3729 if (success && inputItr.inExpansion()) {
3730 // We obtained a match by consuming part of a string obtained from
3731 // case-folding a single code point of the input text.
3732 // This does not count as an overall match.
3733 success = FALSE;
3734 }
3735
3736 if (success) {
3737 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3738 } else {
3739 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3740 }
3741
3742 }
3743 break;
3744
3745 case URX_STO_INP_LOC:
3746 {
3747 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3748 fp->fExtra[opValue] = fp->fInputIdx;
3749 }
3750 break;
3751
3752 case URX_JMPX:
3753 {
3754 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3755 fp->fPatIdx += 1;
3756 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3757 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3758 int64_t savedInputIdx = fp->fExtra[dataLoc];
3759 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3760 if (savedInputIdx < fp->fInputIdx) {
3761 fp->fPatIdx = opValue; // JMP
3762 } else {
3763 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3764 }
3765 }
3766 break;
3767
3768 case URX_LA_START:
3769 {
3770 // Entering a look around block.
3771 // Save Stack Ptr, Input Pos.
3772 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3773 fData[opValue] = fStack->size();
3774 fData[opValue+1] = fp->fInputIdx;
3775 fData[opValue+2] = fActiveStart;
3776 fData[opValue+3] = fActiveLimit;
3777 fActiveStart = fLookStart; // Set the match region change for
3778 fActiveLimit = fLookLimit; // transparent bounds.
3779 }
3780 break;
3781
3782 case URX_LA_END:
3783 {
3784 // Leaving a look-ahead block.
3785 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3786 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3787 int32_t stackSize = fStack->size();
3788 int32_t newStackSize =(int32_t)fData[opValue];
3789 U_ASSERT(stackSize >= newStackSize);
3790 if (stackSize > newStackSize) {
3791 // Copy the current top frame back to the new (cut back) top frame.
3792 // This makes the capture groups from within the look-ahead
3793 // expression available.
3794 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3795 int32_t j;
3796 for (j=0; j<fFrameSize; j++) {
3797 newFP[j] = ((int64_t *)fp)[j];
3798 }
3799 fp = (REStackFrame *)newFP;
3800 fStack->setSize(newStackSize);
3801 }
3802 fp->fInputIdx = fData[opValue+1];
3803
3804 // Restore the active region bounds in the input string; they may have
3805 // been changed because of transparent bounds on a Region.
3806 fActiveStart = fData[opValue+2];
3807 fActiveLimit = fData[opValue+3];
3808 U_ASSERT(fActiveStart >= 0);
3809 U_ASSERT(fActiveLimit <= fInputLength);
3810 }
3811 break;
3812
3813 case URX_ONECHAR_I:
3814 // Case insensitive one char. The char from the pattern is already case folded.
3815 // Input text is not, but case folding the input can not reduce two or more code
3816 // points to one.
3817 if (fp->fInputIdx < fActiveLimit) {
3818 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3819
3820 UChar32 c = UTEXT_NEXT32(fInputText);
3821 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3822 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3823 break;
3824 }
3825 } else {
3826 fHitEnd = TRUE;
3827 }
3828
3829 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3830 break;
3831
3832 case URX_STRING_I:
3833 {
3834 // Case-insensitive test input against a literal string.
3835 // Strings require two slots in the compiled pattern, one for the
3836 // offset to the string text, and one for the length.
3837 // The compiled string has already been case folded.
3838 {
3839 const UChar *patternString = litText + opValue;
3840 int32_t patternStringIdx = 0;
3841
3842 op = (int32_t)pat[fp->fPatIdx];
3843 fp->fPatIdx++;
3844 opType = URX_TYPE(op);
3845 opValue = URX_VAL(op);
3846 U_ASSERT(opType == URX_STRING_LEN);
3847 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3848
3849
3850 UChar32 cPattern;
3851 UChar32 cText;
3852 UBool success = TRUE;
3853
3854 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3855 CaseFoldingUTextIterator inputIterator(*fInputText);
3856 while (patternStringIdx < patternStringLen) {
3857 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3858 success = FALSE;
3859 fHitEnd = TRUE;
3860 break;
3861 }
3862 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3863 cText = inputIterator.next();
3864 if (cText != cPattern) {
3865 success = FALSE;
3866 break;
3867 }
3868 }
3869 if (inputIterator.inExpansion()) {
3870 success = FALSE;
3871 }
3872
3873 if (success) {
3874 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3875 } else {
3876 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3877 }
3878 }
3879 }
3880 break;
3881
3882 case URX_LB_START:
3883 {
3884 // Entering a look-behind block.
3885 // Save Stack Ptr, Input Pos and active input region.
3886 // TODO: implement transparent bounds. Ticket #6067
3887 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3888 fData[opValue] = fStack->size();
3889 fData[opValue+1] = fp->fInputIdx;
3890 // Save input string length, then reset to pin any matches to end at
3891 // the current position.
3892 fData[opValue+2] = fActiveStart;
3893 fData[opValue+3] = fActiveLimit;
3894 fActiveStart = fRegionStart;
3895 fActiveLimit = fp->fInputIdx;
3896 // Init the variable containing the start index for attempted matches.
3897 fData[opValue+4] = -1;
3898 }
3899 break;
3900
3901
3902 case URX_LB_CONT:
3903 {
3904 // Positive Look-Behind, at top of loop checking for matches of LB expression
3905 // at all possible input starting positions.
3906
3907 // Fetch the min and max possible match lengths. They are the operands
3908 // of this op in the pattern.
3909 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3910 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3911 if (!UTEXT_USES_U16(fInputText)) {
3912 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3913 // The max length need not be exact; it just needs to be >= actual maximum.
3914 maxML *= 3;
3915 }
3916 U_ASSERT(minML <= maxML);
3917 U_ASSERT(minML >= 0);
3918
3919 // Fetch (from data) the last input index where a match was attempted.
3920 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3921 int64_t &lbStartIdx = fData[opValue+4];
3922 if (lbStartIdx < 0) {
3923 // First time through loop.
3924 lbStartIdx = fp->fInputIdx - minML;
3925 if (lbStartIdx > 0) {
3926 // move index to a code point boundary, if it's not on one already.
3927 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3928 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3929 }
3930 } else {
3931 // 2nd through nth time through the loop.
3932 // Back up start position for match by one.
3933 if (lbStartIdx == 0) {
3934 (lbStartIdx)--;
3935 } else {
3936 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3937 (void)UTEXT_PREVIOUS32(fInputText);
3938 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3939 }
3940 }
3941
3942 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3943 // We have tried all potential match starting points without
3944 // getting a match. Backtrack out, and out of the
3945 // Look Behind altogether.
3946 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3947 fActiveStart = fData[opValue+2];
3948 fActiveLimit = fData[opValue+3];
3949 U_ASSERT(fActiveStart >= 0);
3950 U_ASSERT(fActiveLimit <= fInputLength);
3951 break;
3952 }
3953
3954 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3955 // (successful match will fall off the end of the loop.)
3956 fp = StateSave(fp, fp->fPatIdx-3, status);
3957 fp->fInputIdx = lbStartIdx;
3958 }
3959 break;
3960
3961 case URX_LB_END:
3962 // End of a look-behind block, after a successful match.
3963 {
3964 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3965 if (fp->fInputIdx != fActiveLimit) {
3966 // The look-behind expression matched, but the match did not
3967 // extend all the way to the point that we are looking behind from.
3968 // FAIL out of here, which will take us back to the LB_CONT, which
3969 // will retry the match starting at another position or fail
3970 // the look-behind altogether, whichever is appropriate.
3971 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3972 break;
3973 }
3974
3975 // Look-behind match is good. Restore the original input string region,
3976 // which had been truncated to pin the end of the lookbehind match to the
3977 // position being looked-behind.
3978 fActiveStart = fData[opValue+2];
3979 fActiveLimit = fData[opValue+3];
3980 U_ASSERT(fActiveStart >= 0);
3981 U_ASSERT(fActiveLimit <= fInputLength);
3982 }
3983 break;
3984
3985
3986 case URX_LBN_CONT:
3987 {
3988 // Negative Look-Behind, at top of loop checking for matches of LB expression
3989 // at all possible input starting positions.
3990
3991 // Fetch the extra parameters of this op.
3992 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3993 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3994 if (!UTEXT_USES_U16(fInputText)) {
3995 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3996 // The max length need not be exact; it just needs to be >= actual maximum.
3997 maxML *= 3;
3998 }
3999 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4000 continueLoc = URX_VAL(continueLoc);
4001 U_ASSERT(minML <= maxML);
4002 U_ASSERT(minML >= 0);
4003 U_ASSERT(continueLoc > fp->fPatIdx);
4004
4005 // Fetch (from data) the last input index where a match was attempted.
4006 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4007 int64_t &lbStartIdx = fData[opValue+4];
4008 if (lbStartIdx < 0) {
4009 // First time through loop.
4010 lbStartIdx = fp->fInputIdx - minML;
4011 if (lbStartIdx > 0) {
4012 // move index to a code point boundary, if it's not on one already.
4013 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4014 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4015 }
4016 } else {
4017 // 2nd through nth time through the loop.
4018 // Back up start position for match by one.
4019 if (lbStartIdx == 0) {
4020 (lbStartIdx)--;
4021 } else {
4022 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4023 (void)UTEXT_PREVIOUS32(fInputText);
4024 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4025 }
4026 }
4027
4028 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4029 // We have tried all potential match starting points without
4030 // getting a match, which means that the negative lookbehind as
4031 // a whole has succeeded. Jump forward to the continue location
4032 fActiveStart = fData[opValue+2];
4033 fActiveLimit = fData[opValue+3];
4034 U_ASSERT(fActiveStart >= 0);
4035 U_ASSERT(fActiveLimit <= fInputLength);
4036 fp->fPatIdx = continueLoc;
4037 break;
4038 }
4039
4040 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4041 // (successful match will cause a FAIL out of the loop altogether.)
4042 fp = StateSave(fp, fp->fPatIdx-4, status);
4043 fp->fInputIdx = lbStartIdx;
4044 }
4045 break;
4046
4047 case URX_LBN_END:
4048 // End of a negative look-behind block, after a successful match.
4049 {
4050 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4051 if (fp->fInputIdx != fActiveLimit) {
4052 // The look-behind expression matched, but the match did not
4053 // extend all the way to the point that we are looking behind from.
4054 // FAIL out of here, which will take us back to the LB_CONT, which
4055 // will retry the match starting at another position or succeed
4056 // the look-behind altogether, whichever is appropriate.
4057 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4058 break;
4059 }
4060
4061 // Look-behind expression matched, which means look-behind test as
4062 // a whole Fails
4063
4064 // Restore the original input string length, which had been truncated
4065 // inorder to pin the end of the lookbehind match
4066 // to the position being looked-behind.
4067 fActiveStart = fData[opValue+2];
4068 fActiveLimit = fData[opValue+3];
4069 U_ASSERT(fActiveStart >= 0);
4070 U_ASSERT(fActiveLimit <= fInputLength);
4071
4072 // Restore original stack position, discarding any state saved
4073 // by the successful pattern match.
4074 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4075 int32_t newStackSize = (int32_t)fData[opValue];
4076 U_ASSERT(fStack->size() > newStackSize);
4077 fStack->setSize(newStackSize);
4078
4079 // FAIL, which will take control back to someplace
4080 // prior to entering the look-behind test.
4081 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4082 }
4083 break;
4084
4085
4086 case URX_LOOP_SR_I:
4087 // Loop Initialization for the optimized implementation of
4088 // [some character set]*
4089 // This op scans through all matching input.
4090 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4091 {
4092 U_ASSERT(opValue > 0 && opValue < fSets->size());
4093 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4094 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4095
4096 // Loop through input, until either the input is exhausted or
4097 // we reach a character that is not a member of the set.
4098 int64_t ix = fp->fInputIdx;
4099 UTEXT_SETNATIVEINDEX(fInputText, ix);
4100 for (;;) {
4101 if (ix >= fActiveLimit) {
4102 fHitEnd = TRUE;
4103 break;
4104 }
4105 UChar32 c = UTEXT_NEXT32(fInputText);
4106 if (c<256) {
4107 if (s8->contains(c) == FALSE) {
4108 break;
4109 }
4110 } else {
4111 if (s->contains(c) == FALSE) {
4112 break;
4113 }
4114 }
4115 ix = UTEXT_GETNATIVEINDEX(fInputText);
4116 }
4117
4118 // If there were no matching characters, skip over the loop altogether.
4119 // The loop doesn't run at all, a * op always succeeds.
4120 if (ix == fp->fInputIdx) {
4121 fp->fPatIdx++; // skip the URX_LOOP_C op.
4122 break;
4123 }
4124
4125 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4126 // must follow. It's operand is the stack location
4127 // that holds the starting input index for the match of this [set]*
4128 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4129 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4130 int32_t stackLoc = URX_VAL(loopcOp);
4131 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4132 fp->fExtra[stackLoc] = fp->fInputIdx;
4133 fp->fInputIdx = ix;
4134
4135 // Save State to the URX_LOOP_C op that follows this one,
4136 // so that match failures in the following code will return to there.
4137 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4138 fp = StateSave(fp, fp->fPatIdx, status);
4139 fp->fPatIdx++;
4140 }
4141 break;
4142
4143
4144 case URX_LOOP_DOT_I:
4145 // Loop Initialization for the optimized implementation of .*
4146 // This op scans through all remaining input.
4147 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4148 {
4149 // Loop through input until the input is exhausted (we reach an end-of-line)
4150 // In DOTALL mode, we can just go straight to the end of the input.
4151 int64_t ix;
4152 if ((opValue & 1) == 1) {
4153 // Dot-matches-All mode. Jump straight to the end of the string.
4154 ix = fActiveLimit;
4155 fHitEnd = TRUE;
4156 } else {
4157 // NOT DOT ALL mode. Line endings do not match '.'
4158 // Scan forward until a line ending or end of input.
4159 ix = fp->fInputIdx;
4160 UTEXT_SETNATIVEINDEX(fInputText, ix);
4161 for (;;) {
4162 if (ix >= fActiveLimit) {
4163 fHitEnd = TRUE;
4164 break;
4165 }
4166 UChar32 c = UTEXT_NEXT32(fInputText);
4167 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4168 if ((c == 0x0a) || // 0x0a is newline in both modes.
4169 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4170 isLineTerminator(c))) {
4171 // char is a line ending. Exit the scanning loop.
4172 break;
4173 }
4174 }
4175 ix = UTEXT_GETNATIVEINDEX(fInputText);
4176 }
4177 }
4178
4179 // If there were no matching characters, skip over the loop altogether.
4180 // The loop doesn't run at all, a * op always succeeds.
4181 if (ix == fp->fInputIdx) {
4182 fp->fPatIdx++; // skip the URX_LOOP_C op.
4183 break;
4184 }
4185
4186 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4187 // must follow. It's operand is the stack location
4188 // that holds the starting input index for the match of this .*
4189 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4190 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4191 int32_t stackLoc = URX_VAL(loopcOp);
4192 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4193 fp->fExtra[stackLoc] = fp->fInputIdx;
4194 fp->fInputIdx = ix;
4195
4196 // Save State to the URX_LOOP_C op that follows this one,
4197 // so that match failures in the following code will return to there.
4198 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4199 fp = StateSave(fp, fp->fPatIdx, status);
4200 fp->fPatIdx++;
4201 }
4202 break;
4203
4204
4205 case URX_LOOP_C:
4206 {
4207 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4208 backSearchIndex = fp->fExtra[opValue];
4209 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4210 if (backSearchIndex == fp->fInputIdx) {
4211 // We've backed up the input idx to the point that the loop started.
4212 // The loop is done. Leave here without saving state.
4213 // Subsequent failures won't come back here.
4214 break;
4215 }
4216 // Set up for the next iteration of the loop, with input index
4217 // backed up by one from the last time through,
4218 // and a state save to this instruction in case the following code fails again.
4219 // (We're going backwards because this loop emulates stack unwinding, not
4220 // the initial scan forward.)
4221 U_ASSERT(fp->fInputIdx > 0);
4222 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4223 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4224 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4225
4226 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4227 if (prevC == 0x0a &&
4228 fp->fInputIdx > backSearchIndex &&
4229 twoPrevC == 0x0d) {
4230 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4231 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4232 // .*, stepping back over CRLF pair.
4233 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4234 }
4235 }
4236
4237
4238 fp = StateSave(fp, fp->fPatIdx-1, status);
4239 }
4240 break;
4241
4242
4243
4244 default:
4245 // Trouble. The compiled pattern contains an entry with an
4246 // unrecognized type tag.
4247 UPRV_UNREACHABLE_ASSERT;
4248 // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
4249 // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
4250 // See ICU-21669.
4251 status = U_INTERNAL_PROGRAM_ERROR;
4252 }
4253
4254 if (U_FAILURE(status)) {
4255 isMatch = FALSE;
4256 break;
4257 }
4258 }
4259
4260 breakFromLoop:
4261 fMatch = isMatch;
4262 if (isMatch) {
4263 fLastMatchEnd = fMatchEnd;
4264 fMatchStart = startIdx;
4265 fMatchEnd = fp->fInputIdx;
4266 }
4267
4268 #ifdef REGEX_RUN_DEBUG
4269 if (fTraceDebug) {
4270 if (isMatch) {
4271 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4272 } else {
4273 printf("No match\n\n");
4274 }
4275 }
4276 #endif
4277
4278 fFrame = fp; // The active stack frame when the engine stopped.
4279 // Contains the capture group results that we need to
4280 // access later.
4281 return;
4282 }
4283
4284
4285 //--------------------------------------------------------------------------------
4286 //
4287 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4288 // assumption that the entire string is available in the UText's
4289 // chunk buffer. For now, that means we can use int32_t indexes,
4290 // except for anything that needs to be saved (like group starts
4291 // and ends).
4292 //
4293 // startIdx: begin matching a this index.
4294 // toEnd: if true, match must extend to end of the input region
4295 //
4296 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4297 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4298 UBool isMatch = FALSE; // True if the we have a match.
4299
4300 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4301
4302 int32_t op; // Operation from the compiled pattern, split into
4303 int32_t opType; // the opcode
4304 int32_t opValue; // and the operand value.
4305
4306 #ifdef REGEX_RUN_DEBUG
4307 if (fTraceDebug) {
4308 printf("MatchAt(startIdx=%d)\n", startIdx);
4309 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4310 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4311 }
4312 #endif
4313
4314 if (U_FAILURE(status)) {
4315 return;
4316 }
4317
4318 // Cache frequently referenced items from the compiled pattern
4319 //
4320 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4321
4322 const UChar *litText = fPattern->fLiteralText.getBuffer();
4323 UVector *fSets = fPattern->fSets;
4324
4325 const UChar *inputBuf = fInputText->chunkContents;
4326
4327 fFrameSize = fPattern->fFrameSize;
4328 REStackFrame *fp = resetStack();
4329 if (U_FAILURE(fDeferredStatus)) {
4330 status = fDeferredStatus;
4331 return;
4332 }
4333
4334 fp->fPatIdx = 0;
4335 fp->fInputIdx = startIdx;
4336
4337 // Zero out the pattern's static data
4338 int32_t i;
4339 for (i = 0; i<fPattern->fDataSize; i++) {
4340 fData[i] = 0;
4341 }
4342
4343 //
4344 // Main loop for interpreting the compiled pattern.
4345 // One iteration of the loop per pattern operation performed.
4346 //
4347 for (;;) {
4348 op = (int32_t)pat[fp->fPatIdx];
4349 opType = URX_TYPE(op);
4350 opValue = URX_VAL(op);
4351 #ifdef REGEX_RUN_DEBUG
4352 if (fTraceDebug) {
4353 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4354 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4355 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4356 fPattern->dumpOp(fp->fPatIdx);
4357 }
4358 #endif
4359 fp->fPatIdx++;
4360
4361 switch (opType) {
4362
4363
4364 case URX_NOP:
4365 break;
4366
4367
4368 case URX_BACKTRACK:
4369 // Force a backtrack. In some circumstances, the pattern compiler
4370 // will notice that the pattern can't possibly match anything, and will
4371 // emit one of these at that point.
4372 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4373 break;
4374
4375
4376 case URX_ONECHAR:
4377 if (fp->fInputIdx < fActiveLimit) {
4378 UChar32 c;
4379 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4380 if (c == opValue) {
4381 break;
4382 }
4383 } else {
4384 fHitEnd = TRUE;
4385 }
4386 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4387 break;
4388
4389
4390 case URX_STRING:
4391 {
4392 // Test input against a literal string.
4393 // Strings require two slots in the compiled pattern, one for the
4394 // offset to the string text, and one for the length.
4395 int32_t stringStartIdx = opValue;
4396 int32_t stringLen;
4397
4398 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4399 fp->fPatIdx++;
4400 opType = URX_TYPE(op);
4401 stringLen = URX_VAL(op);
4402 U_ASSERT(opType == URX_STRING_LEN);
4403 U_ASSERT(stringLen >= 2);
4404
4405 const UChar * pInp = inputBuf + fp->fInputIdx;
4406 const UChar * pInpLimit = inputBuf + fActiveLimit;
4407 const UChar * pPat = litText+stringStartIdx;
4408 const UChar * pEnd = pInp + stringLen;
4409 UBool success = TRUE;
4410 while (pInp < pEnd) {
4411 if (pInp >= pInpLimit) {
4412 fHitEnd = TRUE;
4413 success = FALSE;
4414 break;
4415 }
4416 if (*pInp++ != *pPat++) {
4417 success = FALSE;
4418 break;
4419 }
4420 }
4421
4422 if (success) {
4423 fp->fInputIdx += stringLen;
4424 } else {
4425 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4426 }
4427 }
4428 break;
4429
4430
4431 case URX_STATE_SAVE:
4432 fp = StateSave(fp, opValue, status);
4433 break;
4434
4435
4436 case URX_END:
4437 // The match loop will exit via this path on a successful match,
4438 // when we reach the end of the pattern.
4439 if (toEnd && fp->fInputIdx != fActiveLimit) {
4440 // The pattern matched, but not to the end of input. Try some more.
4441 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4442 break;
4443 }
4444 isMatch = TRUE;
4445 goto breakFromLoop;
4446
4447 // Start and End Capture stack frame variables are laid out out like this:
4448 // fp->fExtra[opValue] - The start of a completed capture group
4449 // opValue+1 - The end of a completed capture group
4450 // opValue+2 - the start of a capture group whose end
4451 // has not yet been reached (and might not ever be).
4452 case URX_START_CAPTURE:
4453 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4454 fp->fExtra[opValue+2] = fp->fInputIdx;
4455 break;
4456
4457
4458 case URX_END_CAPTURE:
4459 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4460 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4461 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4462 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4463 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4464 break;
4465
4466
4467 case URX_DOLLAR: // $, test for End of line
4468 // or for position before new line at end of input
4469 if (fp->fInputIdx < fAnchorLimit-2) {
4470 // We are no where near the end of input. Fail.
4471 // This is the common case. Keep it first.
4472 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4473 break;
4474 }
4475 if (fp->fInputIdx >= fAnchorLimit) {
4476 // We really are at the end of input. Success.
4477 fHitEnd = TRUE;
4478 fRequireEnd = TRUE;
4479 break;
4480 }
4481
4482 // If we are positioned just before a new-line that is located at the
4483 // end of input, succeed.
4484 if (fp->fInputIdx == fAnchorLimit-1) {
4485 UChar32 c;
4486 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4487
4488 if (isLineTerminator(c)) {
4489 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4490 // At new-line at end of input. Success
4491 fHitEnd = TRUE;
4492 fRequireEnd = TRUE;
4493 break;
4494 }
4495 }
4496 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4497 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4498 fHitEnd = TRUE;
4499 fRequireEnd = TRUE;
4500 break; // At CR/LF at end of input. Success
4501 }
4502
4503 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4504
4505 break;
4506
4507
4508 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4509 if (fp->fInputIdx >= fAnchorLimit-1) {
4510 // Either at the last character of input, or off the end.
4511 if (fp->fInputIdx == fAnchorLimit-1) {
4512 // At last char of input. Success if it's a new line.
4513 if (inputBuf[fp->fInputIdx] == 0x0a) {
4514 fHitEnd = TRUE;
4515 fRequireEnd = TRUE;
4516 break;
4517 }
4518 } else {
4519 // Off the end of input. Success.
4520 fHitEnd = TRUE;
4521 fRequireEnd = TRUE;
4522 break;
4523 }
4524 }
4525
4526 // Not at end of input. Back-track out.
4527 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4528 break;
4529
4530
4531 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4532 {
4533 if (fp->fInputIdx >= fAnchorLimit) {
4534 // We really are at the end of input. Success.
4535 fHitEnd = TRUE;
4536 fRequireEnd = TRUE;
4537 break;
4538 }
4539 // If we are positioned just before a new-line, succeed.
4540 // It makes no difference where the new-line is within the input.
4541 UChar32 c = inputBuf[fp->fInputIdx];
4542 if (isLineTerminator(c)) {
4543 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4544 // In multi-line mode, hitting a new-line just before the end of input does not
4545 // set the hitEnd or requireEnd flags
4546 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4547 break;
4548 }
4549 }
4550 // not at a new line. Fail.
4551 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552 }
4553 break;
4554
4555
4556 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4557 {
4558 if (fp->fInputIdx >= fAnchorLimit) {
4559 // We really are at the end of input. Success.
4560 fHitEnd = TRUE;
4561 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4562 break; // adding a new-line would not lose the match.
4563 }
4564 // If we are not positioned just before a new-line, the test fails; backtrack out.
4565 // It makes no difference where the new-line is within the input.
4566 if (inputBuf[fp->fInputIdx] != 0x0a) {
4567 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4568 }
4569 }
4570 break;
4571
4572
4573 case URX_CARET: // ^, test for start of line
4574 if (fp->fInputIdx != fAnchorStart) {
4575 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4576 }
4577 break;
4578
4579
4580 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4581 {
4582 if (fp->fInputIdx == fAnchorStart) {
4583 // We are at the start input. Success.
4584 break;
4585 }
4586 // Check whether character just before the current pos is a new-line
4587 // unless we are at the end of input
4588 UChar c = inputBuf[fp->fInputIdx - 1];
4589 if ((fp->fInputIdx < fAnchorLimit) &&
4590 isLineTerminator(c)) {
4591 // It's a new-line. ^ is true. Success.
4592 // TODO: what should be done with positions between a CR and LF?
4593 break;
4594 }
4595 // Not at the start of a line. Fail.
4596 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4597 }
4598 break;
4599
4600
4601 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4602 {
4603 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4604 if (fp->fInputIdx <= fAnchorStart) {
4605 // We are at the start input. Success.
4606 break;
4607 }
4608 // Check whether character just before the current pos is a new-line
4609 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4610 UChar c = inputBuf[fp->fInputIdx - 1];
4611 if (c != 0x0a) {
4612 // Not at the start of a line. Back-track out.
4613 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4614 }
4615 }
4616 break;
4617
4618 case URX_BACKSLASH_B: // Test for word boundaries
4619 {
4620 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4621 success ^= (UBool)(opValue != 0); // flip sense for \B
4622 if (!success) {
4623 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4624 }
4625 }
4626 break;
4627
4628
4629 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4630 {
4631 UBool success = isUWordBoundary(fp->fInputIdx, status);
4632 success ^= (UBool)(opValue != 0); // flip sense for \B
4633 if (!success) {
4634 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4635 }
4636 }
4637 break;
4638
4639
4640 case URX_BACKSLASH_D: // Test for decimal digit
4641 {
4642 if (fp->fInputIdx >= fActiveLimit) {
4643 fHitEnd = TRUE;
4644 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4645 break;
4646 }
4647
4648 UChar32 c;
4649 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4650 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4651 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4652 success ^= (UBool)(opValue != 0); // flip sense for \D
4653 if (!success) {
4654 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4655 }
4656 }
4657 break;
4658
4659
4660 case URX_BACKSLASH_G: // Test for position at end of previous match
4661 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4662 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4663 }
4664 break;
4665
4666
4667 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4668 {
4669 if (fp->fInputIdx >= fActiveLimit) {
4670 fHitEnd = TRUE;
4671 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4672 break;
4673 }
4674 UChar32 c;
4675 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4676 int8_t ctype = u_charType(c);
4677 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4678 success ^= (UBool)(opValue != 0); // flip sense for \H
4679 if (!success) {
4680 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4681 }
4682 }
4683 break;
4684
4685
4686 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4687 {
4688 if (fp->fInputIdx >= fActiveLimit) {
4689 fHitEnd = TRUE;
4690 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4691 break;
4692 }
4693 UChar32 c;
4694 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4695 if (isLineTerminator(c)) {
4696 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4697 // Check for CR/LF sequence. Consume both together when found.
4698 UChar c2;
4699 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4700 if (c2 != 0x0a) {
4701 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4702 }
4703 }
4704 } else {
4705 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4706 }
4707 }
4708 break;
4709
4710
4711 case URX_BACKSLASH_V: // Any single code point line ending.
4712 {
4713 if (fp->fInputIdx >= fActiveLimit) {
4714 fHitEnd = TRUE;
4715 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716 break;
4717 }
4718 UChar32 c;
4719 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4720 UBool success = isLineTerminator(c);
4721 success ^= (UBool)(opValue != 0); // flip sense for \V
4722 if (!success) {
4723 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4724 }
4725 }
4726 break;
4727
4728
4729 case URX_BACKSLASH_X:
4730 // Match a Grapheme, as defined by Unicode UAX 29.
4731
4732 // Fail if at end of input
4733 if (fp->fInputIdx >= fActiveLimit) {
4734 fHitEnd = TRUE;
4735 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4736 break;
4737 }
4738
4739 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
4740 if (fp->fInputIdx >= fActiveLimit) {
4741 fHitEnd = TRUE;
4742 fp->fInputIdx = fActiveLimit;
4743 }
4744 break;
4745
4746
4747 case URX_BACKSLASH_Z: // Test for end of Input
4748 if (fp->fInputIdx < fAnchorLimit) {
4749 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4750 } else {
4751 fHitEnd = TRUE;
4752 fRequireEnd = TRUE;
4753 }
4754 break;
4755
4756
4757
4758 case URX_STATIC_SETREF:
4759 {
4760 // Test input character against one of the predefined sets
4761 // (Word Characters, for example)
4762 // The high bit of the op value is a flag for the match polarity.
4763 // 0: success if input char is in set.
4764 // 1: success if input char is not in set.
4765 if (fp->fInputIdx >= fActiveLimit) {
4766 fHitEnd = TRUE;
4767 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4768 break;
4769 }
4770
4771 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4772 opValue &= ~URX_NEG_SET;
4773 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4774
4775 UChar32 c;
4776 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4777 if (c < 256) {
4778 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4779 if (s8.contains(c)) {
4780 success = !success;
4781 }
4782 } else {
4783 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4784 if (s.contains(c)) {
4785 success = !success;
4786 }
4787 }
4788 if (!success) {
4789 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4790 }
4791 }
4792 break;
4793
4794
4795 case URX_STAT_SETREF_N:
4796 {
4797 // Test input character for NOT being a member of one of
4798 // the predefined sets (Word Characters, for example)
4799 if (fp->fInputIdx >= fActiveLimit) {
4800 fHitEnd = TRUE;
4801 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4802 break;
4803 }
4804
4805 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4806
4807 UChar32 c;
4808 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4809 if (c < 256) {
4810 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4811 if (s8.contains(c) == FALSE) {
4812 break;
4813 }
4814 } else {
4815 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4816 if (s.contains(c) == FALSE) {
4817 break;
4818 }
4819 }
4820 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4821 }
4822 break;
4823
4824
4825 case URX_SETREF:
4826 {
4827 if (fp->fInputIdx >= fActiveLimit) {
4828 fHitEnd = TRUE;
4829 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4830 break;
4831 }
4832
4833 U_ASSERT(opValue > 0 && opValue < fSets->size());
4834
4835 // There is input left. Pick up one char and test it for set membership.
4836 UChar32 c;
4837 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4838 if (c<256) {
4839 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4840 if (s8->contains(c)) {
4841 // The character is in the set. A Match.
4842 break;
4843 }
4844 } else {
4845 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4846 if (s->contains(c)) {
4847 // The character is in the set. A Match.
4848 break;
4849 }
4850 }
4851
4852 // the character wasn't in the set.
4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4854 }
4855 break;
4856
4857
4858 case URX_DOTANY:
4859 {
4860 // . matches anything, but stops at end-of-line.
4861 if (fp->fInputIdx >= fActiveLimit) {
4862 // At end of input. Match failed. Backtrack out.
4863 fHitEnd = TRUE;
4864 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4865 break;
4866 }
4867
4868 // There is input left. Advance over one char, unless we've hit end-of-line
4869 UChar32 c;
4870 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4871 if (isLineTerminator(c)) {
4872 // End of line in normal mode. . does not match.
4873 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4874 break;
4875 }
4876 }
4877 break;
4878
4879
4880 case URX_DOTANY_ALL:
4881 {
4882 // . in dot-matches-all (including new lines) mode
4883 if (fp->fInputIdx >= fActiveLimit) {
4884 // At end of input. Match failed. Backtrack out.
4885 fHitEnd = TRUE;
4886 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4887 break;
4888 }
4889
4890 // There is input left. Advance over one char, except if we are
4891 // at a cr/lf, advance over both of them.
4892 UChar32 c;
4893 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4894 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4895 // In the case of a CR/LF, we need to advance over both.
4896 if (inputBuf[fp->fInputIdx] == 0x0a) {
4897 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4898 }
4899 }
4900 }
4901 break;
4902
4903
4904 case URX_DOTANY_UNIX:
4905 {
4906 // '.' operator, matches all, but stops at end-of-line.
4907 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4908 if (fp->fInputIdx >= fActiveLimit) {
4909 // At end of input. Match failed. Backtrack out.
4910 fHitEnd = TRUE;
4911 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4912 break;
4913 }
4914
4915 // There is input left. Advance over one char, unless we've hit end-of-line
4916 UChar32 c;
4917 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4918 if (c == 0x0a) {
4919 // End of line in normal mode. '.' does not match the \n
4920 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4921 }
4922 }
4923 break;
4924
4925
4926 case URX_JMP:
4927 fp->fPatIdx = opValue;
4928 break;
4929
4930 case URX_FAIL:
4931 isMatch = FALSE;
4932 goto breakFromLoop;
4933
4934 case URX_JMP_SAV:
4935 U_ASSERT(opValue < fPattern->fCompiledPat->size());
4936 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4937 fp->fPatIdx = opValue; // Then JMP.
4938 break;
4939
4940 case URX_JMP_SAV_X:
4941 // This opcode is used with (x)+, when x can match a zero length string.
4942 // Same as JMP_SAV, except conditional on the match having made forward progress.
4943 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4944 // data address of the input position at the start of the loop.
4945 {
4946 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4947 int32_t stoOp = (int32_t)pat[opValue-1];
4948 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4949 int32_t frameLoc = URX_VAL(stoOp);
4950 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4951 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4952 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4953 if (prevInputIdx < fp->fInputIdx) {
4954 // The match did make progress. Repeat the loop.
4955 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4956 fp->fPatIdx = opValue;
4957 fp->fExtra[frameLoc] = fp->fInputIdx;
4958 }
4959 // If the input position did not advance, we do nothing here,
4960 // execution will fall out of the loop.
4961 }
4962 break;
4963
4964 case URX_CTR_INIT:
4965 {
4966 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4967 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4968
4969 // Pick up the three extra operands that CTR_INIT has, and
4970 // skip the pattern location counter past
4971 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4972 fp->fPatIdx += 3;
4973 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
4974 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4975 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4976 U_ASSERT(minCount>=0);
4977 U_ASSERT(maxCount>=minCount || maxCount==-1);
4978 U_ASSERT(loopLoc>=fp->fPatIdx);
4979
4980 if (minCount == 0) {
4981 fp = StateSave(fp, loopLoc+1, status);
4982 }
4983 if (maxCount == -1) {
4984 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
4985 } else if (maxCount == 0) {
4986 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4987 }
4988 }
4989 break;
4990
4991 case URX_CTR_LOOP:
4992 {
4993 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4994 int32_t initOp = (int32_t)pat[opValue];
4995 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4996 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4997 int32_t minCount = (int32_t)pat[opValue+2];
4998 int32_t maxCount = (int32_t)pat[opValue+3];
4999 (*pCounter)++;
5000 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5001 U_ASSERT(*pCounter == maxCount);
5002 break;
5003 }
5004 if (*pCounter >= minCount) {
5005 if (maxCount == -1) {
5006 // Loop has no hard upper bound.
5007 // Check that it is progressing through the input, break if it is not.
5008 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5009 if (fp->fInputIdx == *pLastInputIdx) {
5010 break;
5011 } else {
5012 *pLastInputIdx = fp->fInputIdx;
5013 }
5014 }
5015 fp = StateSave(fp, fp->fPatIdx, status);
5016 } else {
5017 // Increment time-out counter. (StateSave() does it if count >= minCount)
5018 fTickCounter--;
5019 if (fTickCounter <= 0) {
5020 IncrementTime(status); // Re-initializes fTickCounter
5021 }
5022 }
5023 fp->fPatIdx = opValue + 4; // Loop back.
5024 }
5025 break;
5026
5027 case URX_CTR_INIT_NG:
5028 {
5029 // Initialize a non-greedy loop
5030 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5031 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5032
5033 // Pick up the three extra operands that CTR_INIT_NG has, and
5034 // skip the pattern location counter past
5035 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5036 fp->fPatIdx += 3;
5037 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5038 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5039 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5040 U_ASSERT(minCount>=0);
5041 U_ASSERT(maxCount>=minCount || maxCount==-1);
5042 U_ASSERT(loopLoc>fp->fPatIdx);
5043 if (maxCount == -1) {
5044 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5045 }
5046
5047 if (minCount == 0) {
5048 if (maxCount != 0) {
5049 fp = StateSave(fp, fp->fPatIdx, status);
5050 }
5051 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5052 }
5053 }
5054 break;
5055
5056 case URX_CTR_LOOP_NG:
5057 {
5058 // Non-greedy {min, max} loops
5059 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5060 int32_t initOp = (int32_t)pat[opValue];
5061 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5062 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5063 int32_t minCount = (int32_t)pat[opValue+2];
5064 int32_t maxCount = (int32_t)pat[opValue+3];
5065
5066 (*pCounter)++;
5067 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5068 // The loop has matched the maximum permitted number of times.
5069 // Break out of here with no action. Matching will
5070 // continue with the following pattern.
5071 U_ASSERT(*pCounter == maxCount);
5072 break;
5073 }
5074
5075 if (*pCounter < minCount) {
5076 // We haven't met the minimum number of matches yet.
5077 // Loop back for another one.
5078 fp->fPatIdx = opValue + 4; // Loop back.
5079 fTickCounter--;
5080 if (fTickCounter <= 0) {
5081 IncrementTime(status); // Re-initializes fTickCounter
5082 }
5083 } else {
5084 // We do have the minimum number of matches.
5085
5086 // If there is no upper bound on the loop iterations, check that the input index
5087 // is progressing, and stop the loop if it is not.
5088 if (maxCount == -1) {
5089 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5090 if (fp->fInputIdx == *pLastInputIdx) {
5091 break;
5092 }
5093 *pLastInputIdx = fp->fInputIdx;
5094 }
5095
5096 // Loop Continuation: we will fall into the pattern following the loop
5097 // (non-greedy, don't execute loop body first), but first do
5098 // a state save to the top of the loop, so that a match failure
5099 // in the following pattern will try another iteration of the loop.
5100 fp = StateSave(fp, opValue + 4, status);
5101 }
5102 }
5103 break;
5104
5105 case URX_STO_SP:
5106 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5107 fData[opValue] = fStack->size();
5108 break;
5109
5110 case URX_LD_SP:
5111 {
5112 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5113 int32_t newStackSize = (int32_t)fData[opValue];
5114 U_ASSERT(newStackSize <= fStack->size());
5115 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5116 if (newFP == (int64_t *)fp) {
5117 break;
5118 }
5119 int32_t j;
5120 for (j=0; j<fFrameSize; j++) {
5121 newFP[j] = ((int64_t *)fp)[j];
5122 }
5123 fp = (REStackFrame *)newFP;
5124 fStack->setSize(newStackSize);
5125 }
5126 break;
5127
5128 case URX_BACKREF:
5129 {
5130 U_ASSERT(opValue < fFrameSize);
5131 int64_t groupStartIdx = fp->fExtra[opValue];
5132 int64_t groupEndIdx = fp->fExtra[opValue+1];
5133 U_ASSERT(groupStartIdx <= groupEndIdx);
5134 int64_t inputIndex = fp->fInputIdx;
5135 if (groupStartIdx < 0) {
5136 // This capture group has not participated in the match thus far,
5137 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5138 break;
5139 }
5140 UBool success = TRUE;
5141 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5142 if (inputIndex >= fActiveLimit) {
5143 success = FALSE;
5144 fHitEnd = TRUE;
5145 break;
5146 }
5147 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5148 success = FALSE;
5149 break;
5150 }
5151 }
5152 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5153 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5154 // Capture group ended with an unpaired lead surrogate.
5155 // Back reference is not permitted to match lead only of a surrogatge pair.
5156 success = FALSE;
5157 }
5158 if (success) {
5159 fp->fInputIdx = inputIndex;
5160 } else {
5161 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5162 }
5163 }
5164 break;
5165
5166 case URX_BACKREF_I:
5167 {
5168 U_ASSERT(opValue < fFrameSize);
5169 int64_t groupStartIdx = fp->fExtra[opValue];
5170 int64_t groupEndIdx = fp->fExtra[opValue+1];
5171 U_ASSERT(groupStartIdx <= groupEndIdx);
5172 if (groupStartIdx < 0) {
5173 // This capture group has not participated in the match thus far,
5174 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5175 break;
5176 }
5177 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5178 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5179
5180 // Note: if the capture group match was of an empty string the backref
5181 // match succeeds. Verified by testing: Perl matches succeed
5182 // in this case, so we do too.
5183
5184 UBool success = TRUE;
5185 for (;;) {
5186 UChar32 captureGroupChar = captureGroupItr.next();
5187 if (captureGroupChar == U_SENTINEL) {
5188 success = TRUE;
5189 break;
5190 }
5191 UChar32 inputChar = inputItr.next();
5192 if (inputChar == U_SENTINEL) {
5193 success = FALSE;
5194 fHitEnd = TRUE;
5195 break;
5196 }
5197 if (inputChar != captureGroupChar) {
5198 success = FALSE;
5199 break;
5200 }
5201 }
5202
5203 if (success && inputItr.inExpansion()) {
5204 // We obtained a match by consuming part of a string obtained from
5205 // case-folding a single code point of the input text.
5206 // This does not count as an overall match.
5207 success = FALSE;
5208 }
5209
5210 if (success) {
5211 fp->fInputIdx = inputItr.getIndex();
5212 } else {
5213 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5214 }
5215 }
5216 break;
5217
5218 case URX_STO_INP_LOC:
5219 {
5220 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5221 fp->fExtra[opValue] = fp->fInputIdx;
5222 }
5223 break;
5224
5225 case URX_JMPX:
5226 {
5227 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5228 fp->fPatIdx += 1;
5229 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5230 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5231 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5232 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5233 if (savedInputIdx < fp->fInputIdx) {
5234 fp->fPatIdx = opValue; // JMP
5235 } else {
5236 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5237 }
5238 }
5239 break;
5240
5241 case URX_LA_START:
5242 {
5243 // Entering a look around block.
5244 // Save Stack Ptr, Input Pos.
5245 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5246 fData[opValue] = fStack->size();
5247 fData[opValue+1] = fp->fInputIdx;
5248 fData[opValue+2] = fActiveStart;
5249 fData[opValue+3] = fActiveLimit;
5250 fActiveStart = fLookStart; // Set the match region change for
5251 fActiveLimit = fLookLimit; // transparent bounds.
5252 }
5253 break;
5254
5255 case URX_LA_END:
5256 {
5257 // Leaving a look around block.
5258 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5259 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5260 int32_t stackSize = fStack->size();
5261 int32_t newStackSize = (int32_t)fData[opValue];
5262 U_ASSERT(stackSize >= newStackSize);
5263 if (stackSize > newStackSize) {
5264 // Copy the current top frame back to the new (cut back) top frame.
5265 // This makes the capture groups from within the look-ahead
5266 // expression available.
5267 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5268 int32_t j;
5269 for (j=0; j<fFrameSize; j++) {
5270 newFP[j] = ((int64_t *)fp)[j];
5271 }
5272 fp = (REStackFrame *)newFP;
5273 fStack->setSize(newStackSize);
5274 }
5275 fp->fInputIdx = fData[opValue+1];
5276
5277 // Restore the active region bounds in the input string; they may have
5278 // been changed because of transparent bounds on a Region.
5279 fActiveStart = fData[opValue+2];
5280 fActiveLimit = fData[opValue+3];
5281 U_ASSERT(fActiveStart >= 0);
5282 U_ASSERT(fActiveLimit <= fInputLength);
5283 }
5284 break;
5285
5286 case URX_ONECHAR_I:
5287 if (fp->fInputIdx < fActiveLimit) {
5288 UChar32 c;
5289 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5290 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5291 break;
5292 }
5293 } else {
5294 fHitEnd = TRUE;
5295 }
5296 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5297 break;
5298
5299 case URX_STRING_I:
5300 // Case-insensitive test input against a literal string.
5301 // Strings require two slots in the compiled pattern, one for the
5302 // offset to the string text, and one for the length.
5303 // The compiled string has already been case folded.
5304 {
5305 const UChar *patternString = litText + opValue;
5306
5307 op = (int32_t)pat[fp->fPatIdx];
5308 fp->fPatIdx++;
5309 opType = URX_TYPE(op);
5310 opValue = URX_VAL(op);
5311 U_ASSERT(opType == URX_STRING_LEN);
5312 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5313
5314 UChar32 cText;
5315 UChar32 cPattern;
5316 UBool success = TRUE;
5317 int32_t patternStringIdx = 0;
5318 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5319 while (patternStringIdx < patternStringLen) {
5320 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5321 cText = inputIterator.next();
5322 if (cText != cPattern) {
5323 success = FALSE;
5324 if (cText == U_SENTINEL) {
5325 fHitEnd = TRUE;
5326 }
5327 break;
5328 }
5329 }
5330 if (inputIterator.inExpansion()) {
5331 success = FALSE;
5332 }
5333
5334 if (success) {
5335 fp->fInputIdx = inputIterator.getIndex();
5336 } else {
5337 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5338 }
5339 }
5340 break;
5341
5342 case URX_LB_START:
5343 {
5344 // Entering a look-behind block.
5345 // Save Stack Ptr, Input Pos and active input region.
5346 // TODO: implement transparent bounds. Ticket #6067
5347 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5348 fData[opValue] = fStack->size();
5349 fData[opValue+1] = fp->fInputIdx;
5350 // Save input string length, then reset to pin any matches to end at
5351 // the current position.
5352 fData[opValue+2] = fActiveStart;
5353 fData[opValue+3] = fActiveLimit;
5354 fActiveStart = fRegionStart;
5355 fActiveLimit = fp->fInputIdx;
5356 // Init the variable containing the start index for attempted matches.
5357 fData[opValue+4] = -1;
5358 }
5359 break;
5360
5361
5362 case URX_LB_CONT:
5363 {
5364 // Positive Look-Behind, at top of loop checking for matches of LB expression
5365 // at all possible input starting positions.
5366
5367 // Fetch the min and max possible match lengths. They are the operands
5368 // of this op in the pattern.
5369 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5370 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5371 U_ASSERT(minML <= maxML);
5372 U_ASSERT(minML >= 0);
5373
5374 // Fetch (from data) the last input index where a match was attempted.
5375 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5376 int64_t &lbStartIdx = fData[opValue+4];
5377 if (lbStartIdx < 0) {
5378 // First time through loop.
5379 lbStartIdx = fp->fInputIdx - minML;
5380 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5381 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5382 }
5383 } else {
5384 // 2nd through nth time through the loop.
5385 // Back up start position for match by one.
5386 if (lbStartIdx == 0) {
5387 lbStartIdx--;
5388 } else {
5389 U16_BACK_1(inputBuf, 0, lbStartIdx);
5390 }
5391 }
5392
5393 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5394 // We have tried all potential match starting points without
5395 // getting a match. Backtrack out, and out of the
5396 // Look Behind altogether.
5397 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5398 fActiveStart = fData[opValue+2];
5399 fActiveLimit = fData[opValue+3];
5400 U_ASSERT(fActiveStart >= 0);
5401 U_ASSERT(fActiveLimit <= fInputLength);
5402 break;
5403 }
5404
5405 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5406 // (successful match will fall off the end of the loop.)
5407 fp = StateSave(fp, fp->fPatIdx-3, status);
5408 fp->fInputIdx = lbStartIdx;
5409 }
5410 break;
5411
5412 case URX_LB_END:
5413 // End of a look-behind block, after a successful match.
5414 {
5415 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5416 if (fp->fInputIdx != fActiveLimit) {
5417 // The look-behind expression matched, but the match did not
5418 // extend all the way to the point that we are looking behind from.
5419 // FAIL out of here, which will take us back to the LB_CONT, which
5420 // will retry the match starting at another position or fail
5421 // the look-behind altogether, whichever is appropriate.
5422 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5423 break;
5424 }
5425
5426 // Look-behind match is good. Restore the original input string region,
5427 // which had been truncated to pin the end of the lookbehind match to the
5428 // position being looked-behind.
5429 fActiveStart = fData[opValue+2];
5430 fActiveLimit = fData[opValue+3];
5431 U_ASSERT(fActiveStart >= 0);
5432 U_ASSERT(fActiveLimit <= fInputLength);
5433 }
5434 break;
5435
5436
5437 case URX_LBN_CONT:
5438 {
5439 // Negative Look-Behind, at top of loop checking for matches of LB expression
5440 // at all possible input starting positions.
5441
5442 // Fetch the extra parameters of this op.
5443 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5444 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5445 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5446 continueLoc = URX_VAL(continueLoc);
5447 U_ASSERT(minML <= maxML);
5448 U_ASSERT(minML >= 0);
5449 U_ASSERT(continueLoc > fp->fPatIdx);
5450
5451 // Fetch (from data) the last input index where a match was attempted.
5452 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5453 int64_t &lbStartIdx = fData[opValue+4];
5454 if (lbStartIdx < 0) {
5455 // First time through loop.
5456 lbStartIdx = fp->fInputIdx - minML;
5457 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5458 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5459 }
5460 } else {
5461 // 2nd through nth time through the loop.
5462 // Back up start position for match by one.
5463 if (lbStartIdx == 0) {
5464 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
5465 } else {
5466 U16_BACK_1(inputBuf, 0, lbStartIdx);
5467 }
5468 }
5469
5470 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5471 // We have tried all potential match starting points without
5472 // getting a match, which means that the negative lookbehind as
5473 // a whole has succeeded. Jump forward to the continue location
5474 fActiveStart = fData[opValue+2];
5475 fActiveLimit = fData[opValue+3];
5476 U_ASSERT(fActiveStart >= 0);
5477 U_ASSERT(fActiveLimit <= fInputLength);
5478 fp->fPatIdx = continueLoc;
5479 break;
5480 }
5481
5482 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5483 // (successful match will cause a FAIL out of the loop altogether.)
5484 fp = StateSave(fp, fp->fPatIdx-4, status);
5485 fp->fInputIdx = lbStartIdx;
5486 }
5487 break;
5488
5489 case URX_LBN_END:
5490 // End of a negative look-behind block, after a successful match.
5491 {
5492 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5493 if (fp->fInputIdx != fActiveLimit) {
5494 // The look-behind expression matched, but the match did not
5495 // extend all the way to the point that we are looking behind from.
5496 // FAIL out of here, which will take us back to the LB_CONT, which
5497 // will retry the match starting at another position or succeed
5498 // the look-behind altogether, whichever is appropriate.
5499 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5500 break;
5501 }
5502
5503 // Look-behind expression matched, which means look-behind test as
5504 // a whole Fails
5505
5506 // Restore the original input string length, which had been truncated
5507 // inorder to pin the end of the lookbehind match
5508 // to the position being looked-behind.
5509 fActiveStart = fData[opValue+2];
5510 fActiveLimit = fData[opValue+3];
5511 U_ASSERT(fActiveStart >= 0);
5512 U_ASSERT(fActiveLimit <= fInputLength);
5513
5514 // Restore original stack position, discarding any state saved
5515 // by the successful pattern match.
5516 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5517 int32_t newStackSize = (int32_t)fData[opValue];
5518 U_ASSERT(fStack->size() > newStackSize);
5519 fStack->setSize(newStackSize);
5520
5521 // FAIL, which will take control back to someplace
5522 // prior to entering the look-behind test.
5523 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5524 }
5525 break;
5526
5527
5528 case URX_LOOP_SR_I:
5529 // Loop Initialization for the optimized implementation of
5530 // [some character set]*
5531 // This op scans through all matching input.
5532 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5533 {
5534 U_ASSERT(opValue > 0 && opValue < fSets->size());
5535 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5536 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
5537
5538 // Loop through input, until either the input is exhausted or
5539 // we reach a character that is not a member of the set.
5540 int32_t ix = (int32_t)fp->fInputIdx;
5541 for (;;) {
5542 if (ix >= fActiveLimit) {
5543 fHitEnd = TRUE;
5544 break;
5545 }
5546 UChar32 c;
5547 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5548 if (c<256) {
5549 if (s8->contains(c) == FALSE) {
5550 U16_BACK_1(inputBuf, 0, ix);
5551 break;
5552 }
5553 } else {
5554 if (s->contains(c) == FALSE) {
5555 U16_BACK_1(inputBuf, 0, ix);
5556 break;
5557 }
5558 }
5559 }
5560
5561 // If there were no matching characters, skip over the loop altogether.
5562 // The loop doesn't run at all, a * op always succeeds.
5563 if (ix == fp->fInputIdx) {
5564 fp->fPatIdx++; // skip the URX_LOOP_C op.
5565 break;
5566 }
5567
5568 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5569 // must follow. It's operand is the stack location
5570 // that holds the starting input index for the match of this [set]*
5571 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5572 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5573 int32_t stackLoc = URX_VAL(loopcOp);
5574 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5575 fp->fExtra[stackLoc] = fp->fInputIdx;
5576 fp->fInputIdx = ix;
5577
5578 // Save State to the URX_LOOP_C op that follows this one,
5579 // so that match failures in the following code will return to there.
5580 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5581 fp = StateSave(fp, fp->fPatIdx, status);
5582 fp->fPatIdx++;
5583 }
5584 break;
5585
5586
5587 case URX_LOOP_DOT_I:
5588 // Loop Initialization for the optimized implementation of .*
5589 // This op scans through all remaining input.
5590 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5591 {
5592 // Loop through input until the input is exhausted (we reach an end-of-line)
5593 // In DOTALL mode, we can just go straight to the end of the input.
5594 int32_t ix;
5595 if ((opValue & 1) == 1) {
5596 // Dot-matches-All mode. Jump straight to the end of the string.
5597 ix = (int32_t)fActiveLimit;
5598 fHitEnd = TRUE;
5599 } else {
5600 // NOT DOT ALL mode. Line endings do not match '.'
5601 // Scan forward until a line ending or end of input.
5602 ix = (int32_t)fp->fInputIdx;
5603 for (;;) {
5604 if (ix >= fActiveLimit) {
5605 fHitEnd = TRUE;
5606 break;
5607 }
5608 UChar32 c;
5609 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5610 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5611 if ((c == 0x0a) || // 0x0a is newline in both modes.
5612 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5613 isLineTerminator(c))) {
5614 // char is a line ending. Put the input pos back to the
5615 // line ending char, and exit the scanning loop.
5616 U16_BACK_1(inputBuf, 0, ix);
5617 break;
5618 }
5619 }
5620 }
5621 }
5622
5623 // If there were no matching characters, skip over the loop altogether.
5624 // The loop doesn't run at all, a * op always succeeds.
5625 if (ix == fp->fInputIdx) {
5626 fp->fPatIdx++; // skip the URX_LOOP_C op.
5627 break;
5628 }
5629
5630 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5631 // must follow. It's operand is the stack location
5632 // that holds the starting input index for the match of this .*
5633 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5634 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5635 int32_t stackLoc = URX_VAL(loopcOp);
5636 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5637 fp->fExtra[stackLoc] = fp->fInputIdx;
5638 fp->fInputIdx = ix;
5639
5640 // Save State to the URX_LOOP_C op that follows this one,
5641 // so that match failures in the following code will return to there.
5642 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5643 fp = StateSave(fp, fp->fPatIdx, status);
5644 fp->fPatIdx++;
5645 }
5646 break;
5647
5648
5649 case URX_LOOP_C:
5650 {
5651 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5652 backSearchIndex = (int32_t)fp->fExtra[opValue];
5653 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5654 if (backSearchIndex == fp->fInputIdx) {
5655 // We've backed up the input idx to the point that the loop started.
5656 // The loop is done. Leave here without saving state.
5657 // Subsequent failures won't come back here.
5658 break;
5659 }
5660 // Set up for the next iteration of the loop, with input index
5661 // backed up by one from the last time through,
5662 // and a state save to this instruction in case the following code fails again.
5663 // (We're going backwards because this loop emulates stack unwinding, not
5664 // the initial scan forward.)
5665 U_ASSERT(fp->fInputIdx > 0);
5666 UChar32 prevC;
5667 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5668
5669 if (prevC == 0x0a &&
5670 fp->fInputIdx > backSearchIndex &&
5671 inputBuf[fp->fInputIdx-1] == 0x0d) {
5672 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5673 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5674 // .*, stepping back over CRLF pair.
5675 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5676 }
5677 }
5678
5679
5680 fp = StateSave(fp, fp->fPatIdx-1, status);
5681 }
5682 break;
5683
5684
5685
5686 default:
5687 // Trouble. The compiled pattern contains an entry with an
5688 // unrecognized type tag.
5689 UPRV_UNREACHABLE_ASSERT;
5690 // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
5691 // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
5692 // See ICU-21669.
5693 status = U_INTERNAL_PROGRAM_ERROR;
5694 }
5695
5696 if (U_FAILURE(status)) {
5697 isMatch = FALSE;
5698 break;
5699 }
5700 }
5701
5702 breakFromLoop:
5703 fMatch = isMatch;
5704 if (isMatch) {
5705 fLastMatchEnd = fMatchEnd;
5706 fMatchStart = startIdx;
5707 fMatchEnd = fp->fInputIdx;
5708 }
5709
5710 #ifdef REGEX_RUN_DEBUG
5711 if (fTraceDebug) {
5712 if (isMatch) {
5713 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5714 } else {
5715 printf("No match\n\n");
5716 }
5717 }
5718 #endif
5719
5720 fFrame = fp; // The active stack frame when the engine stopped.
5721 // Contains the capture group results that we need to
5722 // access later.
5723
5724 return;
5725 }
5726
5727
5728 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5729
5730 U_NAMESPACE_END
5731
5732 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5733
5734