1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2007 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 #include "uvector.h"
25 #include "uvectr32.h"
26 #include "regeximp.h"
27 #include "regexst.h"
28
29 // #include <malloc.h> // Needed for heapcheck testing
30
31 U_NAMESPACE_BEGIN
32
33 //-----------------------------------------------------------------------------
34 //
35 // Constructor and Destructor
36 //
37 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)38 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
39 fPattern = pat;
40 fPatternOwned = NULL;
41 fInput = NULL;
42 fTraceDebug = FALSE;
43 fDeferredStatus = U_ZERO_ERROR;
44 fStack = new UVector32(fDeferredStatus);
45 fData = fSmallData;
46 fWordBreakItr = NULL;
47 fTransparentBounds = FALSE;
48 fAnchoringBounds = TRUE;
49 if (pat==NULL) {
50 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
51 return;
52 }
53 if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
54 fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t));
55 }
56 if (fStack == NULL || fData == NULL) {
57 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
58 }
59
60 reset(RegexStaticSets::gStaticSets->fEmptyString);
61 }
62
63
64
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)65 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
66 uint32_t flags, UErrorCode &status) {
67 UParseError pe;
68 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
69 fPattern = fPatternOwned;
70 fTraceDebug = FALSE;
71 fDeferredStatus = U_ZERO_ERROR;
72 fStack = new UVector32(status);
73 fData = fSmallData;
74 fWordBreakItr = NULL;
75 fTransparentBounds = FALSE;
76 fAnchoringBounds = TRUE;
77 if (U_FAILURE(status)) {
78 return;
79 }
80 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
81 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
82 }
83 if (fStack == NULL || fData == NULL) {
84 status = U_MEMORY_ALLOCATION_ERROR;
85 }
86 reset(input);
87 }
88
89
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)90 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
91 uint32_t flags, UErrorCode &status) {
92 UParseError pe;
93 fTraceDebug = FALSE;
94 fDeferredStatus = U_ZERO_ERROR;
95 fStack = new UVector32(status);
96 fData = fSmallData;
97 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
98 fPattern = fPatternOwned;
99 fWordBreakItr = NULL;
100 fTransparentBounds = FALSE;
101 fAnchoringBounds = TRUE;
102 if (U_FAILURE(status)) {
103 return;
104 }
105
106 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) {
107 fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t));
108 }
109 if (fStack == NULL || fData == NULL) {
110 status = U_MEMORY_ALLOCATION_ERROR;
111 }
112 reset(RegexStaticSets::gStaticSets->fEmptyString);
113 }
114
115
116
~RegexMatcher()117 RegexMatcher::~RegexMatcher() {
118 delete fStack;
119 if (fData != fSmallData) {
120 uprv_free(fData);
121 fData = NULL;
122 }
123 if (fPatternOwned) {
124 delete fPatternOwned;
125 fPatternOwned = NULL;
126 fPattern = NULL;
127 }
128 #if UCONFIG_NO_BREAK_ITERATION==0
129 delete fWordBreakItr;
130 #endif
131 }
132
133
134
135 static const UChar BACKSLASH = 0x5c;
136 static const UChar DOLLARSIGN = 0x24;
137 //--------------------------------------------------------------------------------
138 //
139 // appendReplacement
140 //
141 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)142 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
143 const UnicodeString &replacement,
144 UErrorCode &status) {
145 if (U_FAILURE(status)) {
146 return *this;
147 }
148 if (U_FAILURE(fDeferredStatus)) {
149 status = fDeferredStatus;
150 return *this;
151 }
152 if (fMatch == FALSE) {
153 status = U_REGEX_INVALID_STATE;
154 return *this;
155 }
156
157 // Copy input string from the end of previous match to start of current match
158 int32_t len = fMatchStart-fAppendPosition;
159 if (len > 0) {
160 dest.append(*fInput, fAppendPosition, len);
161 }
162 fAppendPosition = fMatchEnd;
163
164
165 // scan the replacement text, looking for substitutions ($n) and \escapes.
166 // TODO: optimize this loop by efficiently scanning for '$' or '\',
167 // move entire ranges not containing substitutions.
168 int32_t replLen = replacement.length();
169 int32_t replIdx = 0;
170 while (replIdx<replLen) {
171 UChar c = replacement.charAt(replIdx);
172 replIdx++;
173 if (c == BACKSLASH) {
174 // Backslash Escape. Copy the following char out without further checks.
175 // Note: Surrogate pairs don't need any special handling
176 // The second half wont be a '$' or a '\', and
177 // will move to the dest normally on the next
178 // loop iteration.
179 if (replIdx >= replLen) {
180 break;
181 }
182 c = replacement.charAt(replIdx);
183
184 if (c==0x55/*U*/ || c==0x75/*u*/) {
185 // We have a \udddd or \Udddddddd escape sequence.
186 UChar32 escapedChar = replacement.unescapeAt(replIdx);
187 if (escapedChar != (UChar32)0xFFFFFFFF) {
188 dest.append(escapedChar);
189 // TODO: Report errors for mal-formed \u escapes?
190 // As this is, the original sequence is output, which may be OK.
191 continue;
192 }
193 }
194
195 // Plain backslash escape. Just put out the escaped character.
196 dest.append(c);
197 replIdx++;
198 continue;
199 }
200
201 if (c != DOLLARSIGN) {
202 // Normal char, not a $. Copy it out without further checks.
203 dest.append(c);
204 continue;
205 }
206
207 // We've got a $. Pick up a capture group number if one follows.
208 // Consume at most the number of digits necessary for the largest capture
209 // number that is valid for this pattern.
210
211 int32_t numDigits = 0;
212 int32_t groupNum = 0;
213 UChar32 digitC;
214 for (;;) {
215 if (replIdx >= replLen) {
216 break;
217 }
218 digitC = replacement.char32At(replIdx);
219 if (u_isdigit(digitC) == FALSE) {
220 break;
221 }
222 replIdx = replacement.moveIndex32(replIdx, 1);
223 groupNum=groupNum*10 + u_charDigitValue(digitC);
224 numDigits++;
225 if (numDigits >= fPattern->fMaxCaptureDigits) {
226 break;
227 }
228 }
229
230
231 if (numDigits == 0) {
232 // The $ didn't introduce a group number at all.
233 // Treat it as just part of the substitution text.
234 dest.append(DOLLARSIGN);
235 continue;
236 }
237
238 // Finally, append the capture group data to the destination.
239 dest.append(group(groupNum, status));
240 if (U_FAILURE(status)) {
241 // Can fail if group number is out of range.
242 break;
243 }
244
245 }
246
247 return *this;
248 }
249
250
251
252 //--------------------------------------------------------------------------------
253 //
254 // appendTail Intended to be used in conjunction with appendReplacement()
255 // To the destination string, append everything following
256 // the last match position from the input string.
257 //
258 // Note: Match ranges do not affect appendTail or appendReplacement
259 //
260 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)261 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
262 int32_t len = fInput->length() - fAppendPosition;
263 if (len > 0) {
264 dest.append(*fInput, fAppendPosition, len);
265 }
266 return dest;
267 }
268
269
270
271 //--------------------------------------------------------------------------------
272 //
273 // end
274 //
275 //--------------------------------------------------------------------------------
end(UErrorCode & err) const276 int32_t RegexMatcher::end(UErrorCode &err) const {
277 return end(0, err);
278 }
279
280
281
end(int32_t group,UErrorCode & err) const282 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
283 if (U_FAILURE(err)) {
284 return -1;
285 }
286 if (fMatch == FALSE) {
287 err = U_REGEX_INVALID_STATE;
288 return -1;
289 }
290 if (group < 0 || group > fPattern->fGroupMap->size()) {
291 err = U_INDEX_OUTOFBOUNDS_ERROR;
292 return -1;
293 }
294 int32_t e = -1;
295 if (group == 0) {
296 e = fMatchEnd;
297 } else {
298 // Get the position within the stack frame of the variables for
299 // this capture group.
300 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
301 U_ASSERT(groupOffset < fPattern->fFrameSize);
302 U_ASSERT(groupOffset >= 0);
303 e = fFrame->fExtra[groupOffset + 1];
304 }
305 return e;
306 }
307
308
309
310 //--------------------------------------------------------------------------------
311 //
312 // find()
313 //
314 //--------------------------------------------------------------------------------
find()315 UBool RegexMatcher::find() {
316 // Start at the position of the last match end. (Will be zero if the
317 // matcher has been reset.
318 //
319 if (U_FAILURE(fDeferredStatus)) {
320 return FALSE;
321 }
322
323 int32_t startPos = fMatchEnd;
324 if (startPos==0) {
325 startPos = fActiveStart;
326 }
327
328 if (fMatch) {
329 // Save the position of any previous successful match.
330 fLastMatchEnd = fMatchEnd;
331
332 if (fMatchStart == fMatchEnd) {
333 // Previous match had zero length. Move start position up one position
334 // to avoid sending find() into a loop on zero-length matches.
335 if (startPos >= fActiveLimit) {
336 fMatch = FALSE;
337 fHitEnd = TRUE;
338 return FALSE;
339 }
340 startPos = fInput->moveIndex32(startPos, 1);
341 }
342 } else {
343 if (fLastMatchEnd >= 0) {
344 // A previous find() failed to match. Don't try again.
345 // (without this test, a pattern with a zero-length match
346 // could match again at the end of an input string.)
347 fHitEnd = TRUE;
348 return FALSE;
349 }
350 }
351
352
353 // Compute the position in the input string beyond which a match can not begin, because
354 // the minimum length match would extend past the end of the input.
355 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
356 // Be aware of possible overflows if making changes here.
357 int32_t testLen = fActiveLimit - fPattern->fMinMatchLen;
358 if (startPos > testLen) {
359 fMatch = FALSE;
360 fHitEnd = TRUE;
361 return FALSE;
362 }
363
364 const UChar *inputBuf = fInput->getBuffer();
365 UChar32 c;
366 U_ASSERT(startPos >= 0);
367
368 switch (fPattern->fStartType) {
369 case START_NO_INFO:
370 // No optimization was found.
371 // Try a match at each input position.
372 for (;;) {
373 MatchAt(startPos, FALSE, fDeferredStatus);
374 if (U_FAILURE(fDeferredStatus)) {
375 return FALSE;
376 }
377 if (fMatch) {
378 return TRUE;
379 }
380 if (startPos >= testLen) {
381 fHitEnd = TRUE;
382 return FALSE;
383 }
384 U16_FWD_1(inputBuf, startPos, fActiveLimit);
385 // Note that it's perfectly OK for a pattern to have a zero-length
386 // match at the end of a string, so we must make sure that the loop
387 // runs with startPos == testLen the last time through.
388 }
389 U_ASSERT(FALSE);
390
391 case START_START:
392 // Matches are only possible at the start of the input string
393 // (pattern begins with ^ or \A)
394 if (startPos > fActiveStart) {
395 fMatch = FALSE;
396 return FALSE;
397 }
398 MatchAt(startPos, FALSE, fDeferredStatus);
399 if (U_FAILURE(fDeferredStatus)) {
400 return FALSE;
401 }
402 return fMatch;
403
404
405 case START_SET:
406 {
407 // Match may start on any char from a pre-computed set.
408 U_ASSERT(fPattern->fMinMatchLen > 0);
409 for (;;) {
410 int32_t pos = startPos;
411 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
412 if (c<256 && fPattern->fInitialChars8->contains(c) ||
413 c>=256 && fPattern->fInitialChars->contains(c)) {
414 MatchAt(pos, FALSE, fDeferredStatus);
415 if (U_FAILURE(fDeferredStatus)) {
416 return FALSE;
417 }
418 if (fMatch) {
419 return TRUE;
420 }
421 }
422 if (pos >= testLen) {
423 fMatch = FALSE;
424 fHitEnd = TRUE;
425 return FALSE;
426 }
427 }
428 }
429 U_ASSERT(FALSE);
430
431 case START_STRING:
432 case START_CHAR:
433 {
434 // Match starts on exactly one char.
435 U_ASSERT(fPattern->fMinMatchLen > 0);
436 UChar32 theChar = fPattern->fInitialChar;
437 for (;;) {
438 int32_t pos = startPos;
439 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
440 if (c == theChar) {
441 MatchAt(pos, FALSE, fDeferredStatus);
442 if (U_FAILURE(fDeferredStatus)) {
443 return FALSE;
444 }
445 if (fMatch) {
446 return TRUE;
447 }
448 }
449 if (pos >= testLen) {
450 fMatch = FALSE;
451 fHitEnd = TRUE;
452 return FALSE;
453 }
454 }
455 }
456 U_ASSERT(FALSE);
457
458 case START_LINE:
459 {
460 UChar32 c;
461 if (startPos == fAnchorStart) {
462 MatchAt(startPos, FALSE, fDeferredStatus);
463 if (U_FAILURE(fDeferredStatus)) {
464 return FALSE;
465 }
466 if (fMatch) {
467 return TRUE;
468 }
469 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
470 }
471
472 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
473 for (;;) {
474 c = inputBuf[startPos-1];
475 if (c == 0x0a) {
476 MatchAt(startPos, FALSE, fDeferredStatus);
477 if (U_FAILURE(fDeferredStatus)) {
478 return FALSE;
479 }
480 if (fMatch) {
481 return TRUE;
482 }
483 }
484 if (startPos >= testLen) {
485 fMatch = FALSE;
486 fHitEnd = TRUE;
487 return FALSE;
488 }
489 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
490 // Note that it's perfectly OK for a pattern to have a zero-length
491 // match at the end of a string, so we must make sure that the loop
492 // runs with startPos == testLen the last time through.
493 }
494 } else {
495 for (;;) {
496 c = inputBuf[startPos-1];
497 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
498 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
499 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
500 startPos++;
501 }
502 MatchAt(startPos, FALSE, fDeferredStatus);
503 if (U_FAILURE(fDeferredStatus)) {
504 return FALSE;
505 }
506 if (fMatch) {
507 return TRUE;
508 }
509 }
510 if (startPos >= testLen) {
511 fMatch = FALSE;
512 fHitEnd = TRUE;
513 return FALSE;
514 }
515 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
516 // Note that it's perfectly OK for a pattern to have a zero-length
517 // match at the end of a string, so we must make sure that the loop
518 // runs with startPos == testLen the last time through.
519 }
520 }
521 }
522
523 default:
524 U_ASSERT(FALSE);
525 }
526
527 U_ASSERT(FALSE);
528 return FALSE;
529 }
530
531
532
find(int32_t start,UErrorCode & status)533 UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
534 if (U_FAILURE(status)) {
535 return FALSE;
536 }
537 if (U_FAILURE(fDeferredStatus)) {
538 status = fDeferredStatus;
539 return FALSE;
540 }
541 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
542 // This will reset the region to be the full input length.
543 if (start < fActiveStart || start > fActiveLimit) {
544 status = U_INDEX_OUTOFBOUNDS_ERROR;
545 return FALSE;
546 }
547 fMatchEnd = start;
548 return find();
549 }
550
551
552
553 //--------------------------------------------------------------------------------
554 //
555 // group()
556 //
557 //--------------------------------------------------------------------------------
group(UErrorCode & status) const558 UnicodeString RegexMatcher::group(UErrorCode &status) const {
559 return group(0, status);
560 }
561
562
563
group(int32_t groupNum,UErrorCode & status) const564 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
565 int32_t s = start(groupNum, status);
566 int32_t e = end(groupNum, status);
567
568 // Note: calling start() and end() above will do all necessary checking that
569 // the group number is OK and that a match exists. status will be set.
570 if (U_FAILURE(status)) {
571 return UnicodeString();
572 }
573 if (U_FAILURE(fDeferredStatus)) {
574 status = fDeferredStatus;
575 return UnicodeString();
576 }
577
578 if (s < 0) {
579 // A capture group wasn't part of the match
580 return UnicodeString();
581 }
582 U_ASSERT(s <= e);
583 return UnicodeString(*fInput, s, e-s);
584 }
585
586
587
588
groupCount() const589 int32_t RegexMatcher::groupCount() const {
590 return fPattern->fGroupMap->size();
591 }
592
593
594
input() const595 const UnicodeString &RegexMatcher::input() const {
596 return *fInput;
597 }
598
599
600 //--------------------------------------------------------------------------------
601 //
602 // hasAnchoringBounds()
603 //
604 //--------------------------------------------------------------------------------
hasAnchoringBounds() const605 UBool RegexMatcher::hasAnchoringBounds() const {
606 return fAnchoringBounds;
607 }
608
609
610 //--------------------------------------------------------------------------------
611 //
612 // hasTransparentBounds()
613 //
614 //--------------------------------------------------------------------------------
hasTransparentBounds() const615 UBool RegexMatcher::hasTransparentBounds() const {
616 return fTransparentBounds;
617 }
618
619
620
621 //--------------------------------------------------------------------------------
622 //
623 // hitEnd()
624 //
625 //--------------------------------------------------------------------------------
hitEnd() const626 UBool RegexMatcher::hitEnd() const {
627 return fHitEnd;
628 }
629
630 //--------------------------------------------------------------------------------
631 //
632 // lookingAt()
633 //
634 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)635 UBool RegexMatcher::lookingAt(UErrorCode &status) {
636 if (U_FAILURE(status)) {
637 return FALSE;
638 }
639 if (U_FAILURE(fDeferredStatus)) {
640 status = fDeferredStatus;
641 return FALSE;
642 }
643 resetPreserveRegion();
644 MatchAt(fActiveStart, FALSE, status);
645 return fMatch;
646 }
647
648
lookingAt(int32_t start,UErrorCode & status)649 UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
650 if (U_FAILURE(status)) {
651 return FALSE;
652 }
653 if (U_FAILURE(fDeferredStatus)) {
654 status = fDeferredStatus;
655 return FALSE;
656 }
657 reset();
658 if (start < fActiveStart || start > fActiveLimit) {
659 status = U_INDEX_OUTOFBOUNDS_ERROR;
660 return FALSE;
661 }
662 MatchAt(start, FALSE, status);
663 return fMatch;
664 }
665
666
667
668 //--------------------------------------------------------------------------------
669 //
670 // matches()
671 //
672 //--------------------------------------------------------------------------------
matches(UErrorCode & status)673 UBool RegexMatcher::matches(UErrorCode &status) {
674 if (U_FAILURE(status)) {
675 return FALSE;
676 }
677 if (U_FAILURE(fDeferredStatus)) {
678 status = fDeferredStatus;
679 return FALSE;
680 }
681 resetPreserveRegion();
682 MatchAt(fActiveStart, TRUE, status);
683 return fMatch;
684 }
685
686
matches(int32_t start,UErrorCode & status)687 UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
688 if (U_FAILURE(status)) {
689 return FALSE;
690 }
691 if (U_FAILURE(fDeferredStatus)) {
692 status = fDeferredStatus;
693 return FALSE;
694 }
695 reset();
696 if (start < fActiveStart || start > fActiveLimit) {
697 status = U_INDEX_OUTOFBOUNDS_ERROR;
698 return FALSE;
699 }
700 MatchAt(start, TRUE, status);
701 return fMatch;
702 }
703
704
705
706 //--------------------------------------------------------------------------------
707 //
708 // pattern
709 //
710 //--------------------------------------------------------------------------------
pattern() const711 const RegexPattern &RegexMatcher::pattern() const {
712 return *fPattern;
713 }
714
715
716
717 //--------------------------------------------------------------------------------
718 //
719 // region
720 //
721 //--------------------------------------------------------------------------------
region(int32_t start,int32_t limit,UErrorCode & status)722 RegexMatcher &RegexMatcher::region(int32_t start, int32_t limit, UErrorCode &status) {
723 if (U_FAILURE(status)) {
724 return *this;
725 }
726 if (start>limit || start<0 || limit<0 || limit>fInput->length()) {
727 status = U_ILLEGAL_ARGUMENT_ERROR;
728 }
729 this->reset();
730 fRegionStart = start;
731 fRegionLimit = limit;
732 fActiveStart = start;
733 fActiveLimit = limit;
734 if (!fTransparentBounds) {
735 fLookStart = start;
736 fLookLimit = limit;
737 }
738 if (fAnchoringBounds) {
739 fAnchorStart = start;
740 fAnchorLimit = limit;
741 }
742 return *this;
743 }
744
745
746
747 //--------------------------------------------------------------------------------
748 //
749 // regionEnd
750 //
751 //--------------------------------------------------------------------------------
regionEnd() const752 int RegexMatcher::regionEnd() const {
753 return fRegionLimit;
754 }
755
756
757 //--------------------------------------------------------------------------------
758 //
759 // regionStart
760 //
761 //--------------------------------------------------------------------------------
regionStart() const762 int RegexMatcher::regionStart() const {
763 return fRegionStart;
764 }
765
766
767 //--------------------------------------------------------------------------------
768 //
769 // replaceAll
770 //
771 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)772 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
773 if (U_FAILURE(status)) {
774 return *fInput;
775 }
776 if (U_FAILURE(fDeferredStatus)) {
777 status = fDeferredStatus;
778 return *fInput;
779 }
780 UnicodeString destString;
781 reset();
782 while (find()) {
783 appendReplacement(destString, replacement, status);
784 if (U_FAILURE(status)) {
785 break;
786 }
787 }
788 appendTail(destString);
789 return destString;
790 }
791
792
793
794
795 //--------------------------------------------------------------------------------
796 //
797 // replaceFirst
798 //
799 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)800 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
801 if (U_FAILURE(status)) {
802 return *fInput;
803 }
804 if (U_FAILURE(fDeferredStatus)) {
805 status = fDeferredStatus;
806 return *fInput;
807 }
808
809 reset();
810 if (!find()) {
811 return *fInput;
812 }
813
814 UnicodeString destString;
815 appendReplacement(destString, replacement, status);
816 appendTail(destString);
817 return destString;
818 }
819
820
821 //--------------------------------------------------------------------------------
822 //
823 // requireEnd
824 //
825 //--------------------------------------------------------------------------------
requireEnd() const826 UBool RegexMatcher::requireEnd() const {
827 return fRequireEnd;
828 }
829
830
831 //--------------------------------------------------------------------------------
832 //
833 // reset
834 //
835 //--------------------------------------------------------------------------------
reset()836 RegexMatcher &RegexMatcher::reset() {
837 fRegionStart = 0;
838 fRegionLimit = fInput->length();
839 fActiveStart = 0;
840 fActiveLimit = fRegionLimit;
841 fAnchorStart = 0;
842 fAnchorLimit = fRegionLimit;
843 fLookStart = 0;
844 fLookLimit = fRegionLimit;
845 resetPreserveRegion();
846 return *this;
847 }
848
849
850
resetPreserveRegion()851 void RegexMatcher::resetPreserveRegion() {
852 fMatchStart = 0;
853 fMatchEnd = 0;
854 fLastMatchEnd = -1;
855 fAppendPosition = 0;
856 fMatch = FALSE;
857 fHitEnd = FALSE;
858 fRequireEnd = FALSE;
859 resetStack();
860 }
861
862
reset(const UnicodeString & input)863 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
864 fInput = &input;
865 reset();
866 if (fWordBreakItr != NULL) {
867 #if UCONFIG_NO_BREAK_ITERATION==0
868 fWordBreakItr->setText(input);
869 #endif
870 }
871 return *this;
872 }
873
874 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
875 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
876 return *this;
877 }*/
878
879
reset(int32_t position,UErrorCode & status)880 RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
881 if (U_FAILURE(status)) {
882 return *this;
883 }
884 reset(); // Reset also resets the region to be the entire string.
885 if (position < 0 || position >= fActiveLimit) {
886 status = U_INDEX_OUTOFBOUNDS_ERROR;
887 return *this;
888 }
889 fMatchEnd = position;
890 return *this;
891 }
892
893
894
895
896
897 //--------------------------------------------------------------------------------
898 //
899 // setTrace
900 //
901 //--------------------------------------------------------------------------------
setTrace(UBool state)902 void RegexMatcher::setTrace(UBool state) {
903 fTraceDebug = state;
904 }
905
906
907
908 //---------------------------------------------------------------------
909 //
910 // split
911 //
912 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)913 int32_t RegexMatcher::split(const UnicodeString &input,
914 UnicodeString dest[],
915 int32_t destCapacity,
916 UErrorCode &status)
917 {
918 //
919 // Check arguements for validity
920 //
921 if (U_FAILURE(status)) {
922 return 0;
923 };
924
925 if (destCapacity < 1) {
926 status = U_ILLEGAL_ARGUMENT_ERROR;
927 return 0;
928 }
929
930 //
931 // Reset for the input text
932 //
933 reset(input);
934 int32_t nextOutputStringStart = 0;
935 if (fActiveLimit == 0) {
936 return 0;
937 }
938
939 //
940 // Loop through the input text, searching for the delimiter pattern
941 //
942 int32_t i;
943 int32_t numCaptureGroups = fPattern->fGroupMap->size();
944 for (i=0; ; i++) {
945 if (i>=destCapacity-1) {
946 // There is one or zero output string left.
947 // Fill the last output string with whatever is left from the input, then exit the loop.
948 // ( i will be == destCapicity if we filled the output array while processing
949 // capture groups of the delimiter expression, in which case we will discard the
950 // last capture group saved in favor of the unprocessed remainder of the
951 // input string.)
952 i = destCapacity-1;
953 int32_t remainingLength = fActiveLimit-nextOutputStringStart;
954 if (remainingLength > 0) {
955 dest[i].setTo(input, nextOutputStringStart, remainingLength);
956 }
957 break;
958 }
959 if (find()) {
960 // We found another delimiter. Move everything from where we started looking
961 // up until the start of the delimiter into the next output string.
962 int32_t fieldLen = fMatchStart - nextOutputStringStart;
963 dest[i].setTo(input, nextOutputStringStart, fieldLen);
964 nextOutputStringStart = fMatchEnd;
965
966 // If the delimiter pattern has capturing parentheses, the captured
967 // text goes out into the next n destination strings.
968 int32_t groupNum;
969 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
970 if (i==destCapacity-1) {
971 break;
972 }
973 i++;
974 dest[i] = group(groupNum, status);
975 }
976
977 if (nextOutputStringStart == fActiveLimit) {
978 // The delimiter was at the end of the string. We're done.
979 break;
980 }
981
982 }
983 else
984 {
985 // We ran off the end of the input while looking for the next delimiter.
986 // All the remaining text goes into the current output string.
987 dest[i].setTo(input, nextOutputStringStart, fActiveLimit-nextOutputStringStart);
988 break;
989 }
990 }
991 return i+1;
992 }
993
994
995
996 //--------------------------------------------------------------------------------
997 //
998 // start
999 //
1000 //--------------------------------------------------------------------------------
start(UErrorCode & status) const1001 int32_t RegexMatcher::start(UErrorCode &status) const {
1002 return start(0, status);
1003 }
1004
1005
1006
1007
1008 //--------------------------------------------------------------------------------
1009 //
1010 // start(int32_t group, UErrorCode &status)
1011 //
1012 //--------------------------------------------------------------------------------
start(int32_t group,UErrorCode & status) const1013 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
1014 if (U_FAILURE(status)) {
1015 return -1;
1016 }
1017 if (U_FAILURE(fDeferredStatus)) {
1018 status = fDeferredStatus;
1019 return -1;
1020 }
1021 if (fMatch == FALSE) {
1022 status = U_REGEX_INVALID_STATE;
1023 return -1;
1024 }
1025 if (group < 0 || group > fPattern->fGroupMap->size()) {
1026 status = U_INDEX_OUTOFBOUNDS_ERROR;
1027 return -1;
1028 }
1029 int32_t s;
1030 if (group == 0) {
1031 s = fMatchStart;
1032 } else {
1033 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
1034 U_ASSERT(groupOffset < fPattern->fFrameSize);
1035 U_ASSERT(groupOffset >= 0);
1036 s = fFrame->fExtra[groupOffset];
1037 }
1038 return s;
1039 }
1040
1041
1042
1043 //--------------------------------------------------------------------------------
1044 //
1045 // useAnchoringBounds
1046 //
1047 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)1048 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
1049 fAnchoringBounds = b;
1050 UErrorCode status = U_ZERO_ERROR;
1051 region(fRegionStart, fRegionLimit, status);
1052 U_ASSERT(U_SUCCESS(status));
1053 return *this;
1054 }
1055
1056
1057 //--------------------------------------------------------------------------------
1058 //
1059 // useTransparentBounds
1060 //
1061 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)1062 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
1063 fTransparentBounds = b;
1064 UErrorCode status = U_ZERO_ERROR;
1065 region(fRegionStart, fRegionLimit, status);
1066 U_ASSERT(U_SUCCESS(status));
1067 return *this;
1068 }
1069
1070
1071 //================================================================================
1072 //
1073 // Code following this point in this file is the internal
1074 // Match Engine Implementation.
1075 //
1076 //================================================================================
1077
1078
1079 //--------------------------------------------------------------------------------
1080 //
1081 // resetStack
1082 // Discard any previous contents of the state save stack, and initialize a
1083 // new stack frame to all -1. The -1s are needed for capture group limits,
1084 // where they indicate that a group has not yet matched anything.
1085 //--------------------------------------------------------------------------------
resetStack()1086 REStackFrame *RegexMatcher::resetStack() {
1087 // Discard any previous contents of the state save stack, and initialize a
1088 // new stack frame to all -1. The -1s are needed for capture group limits, where
1089 // they indicate that a group has not yet matched anything.
1090 fStack->removeAllElements();
1091
1092 int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
1093 int32_t i;
1094 for (i=0; i<fPattern->fFrameSize; i++) {
1095 iFrame[i] = -1;
1096 }
1097 return (REStackFrame *)iFrame;
1098 }
1099
1100
1101
1102 //--------------------------------------------------------------------------------
1103 //
1104 // isWordBoundary
1105 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
1106 // For us,
1107 // If the current char is a combining mark,
1108 // \b is FALSE.
1109 // Else Scan backwards to the first non-combining char.
1110 // We are at a boundary if the this char and the original chars are
1111 // opposite in membership in \w set
1112 //
1113 // parameters: pos - the current position in the input buffer
1114 //
1115 // TODO: double-check edge cases at region boundaries.
1116 //
1117 //--------------------------------------------------------------------------------
isWordBoundary(int32_t pos)1118 UBool RegexMatcher::isWordBoundary(int32_t pos) {
1119 UBool isBoundary = FALSE;
1120 UBool cIsWord = FALSE;
1121
1122 if (pos >= fLookLimit) {
1123 fHitEnd = TRUE;
1124 } else {
1125 // Determine whether char c at current position is a member of the word set of chars.
1126 // If we're off the end of the string, behave as though we're not at a word char.
1127 UChar32 c = fInput->char32At(pos);
1128 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
1129 // Current char is a combining one. Not a boundary.
1130 return FALSE;
1131 }
1132 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
1133 }
1134
1135 // Back up until we come to a non-combining char, determine whether
1136 // that char is a word char.
1137 UBool prevCIsWord = FALSE;
1138 int32_t prevPos = pos;
1139 for (;;) {
1140 if (prevPos <= fLookStart) {
1141 break;
1142 }
1143 prevPos = fInput->moveIndex32(prevPos, -1);
1144 UChar32 prevChar = fInput->char32At(prevPos);
1145 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
1146 || u_charType(prevChar) == U_FORMAT_CHAR)) {
1147 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
1148 break;
1149 }
1150 }
1151 isBoundary = cIsWord ^ prevCIsWord;
1152 return isBoundary;
1153 }
1154
1155 //--------------------------------------------------------------------------------
1156 //
1157 // isUWordBoundary
1158 //
1159 // Test for a word boundary using RBBI word break.
1160 //
1161 // parameters: pos - the current position in the input buffer
1162 //
1163 //--------------------------------------------------------------------------------
isUWordBoundary(int32_t pos)1164 UBool RegexMatcher::isUWordBoundary(int32_t pos) {
1165 UBool returnVal = FALSE;
1166 #if UCONFIG_NO_BREAK_ITERATION==0
1167
1168 // If we haven't yet created a break iterator for this matcher, do it now.
1169 if (fWordBreakItr == NULL) {
1170 fWordBreakItr =
1171 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
1172 if (U_FAILURE(fDeferredStatus)) {
1173 return FALSE;
1174 }
1175 fWordBreakItr->setText(*fInput);
1176 }
1177
1178 if (pos >= fLookLimit) {
1179 fHitEnd = TRUE;
1180 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
1181 // words are not boundaries. All non-word chars stand by themselves,
1182 // with word boundaries on both sides.
1183 } else {
1184 returnVal = fWordBreakItr->isBoundary(pos);
1185 }
1186 #endif
1187 return returnVal;
1188 }
1189
1190 //--------------------------------------------------------------------------------
1191 //
1192 // StateSave
1193 // Make a new stack frame, initialized as a copy of the current stack frame.
1194 // Set the pattern index in the original stack frame from the operand value
1195 // in the opcode. Execution of the engine continues with the state in
1196 // the newly created stack frame
1197 //
1198 // Note that reserveBlock() may grow the stack, resulting in the
1199 // whole thing being relocated in memory.
1200 //
1201 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int32_t savePatIdx,int32_t frameSize,UErrorCode & status)1202 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int32_t savePatIdx, int32_t frameSize, UErrorCode &status) {
1203 // push storage for a new frame.
1204 int32_t *newFP = fStack->reserveBlock(frameSize, status);
1205 fp = (REStackFrame *)(newFP - frameSize); // in case of realloc of stack.
1206
1207 // New stack frame = copy of old top frame.
1208 int32_t *source = (int32_t *)fp;
1209 int32_t *dest = newFP;
1210 for (;;) {
1211 *dest++ = *source++;
1212 if (source == newFP) {
1213 break;
1214 }
1215 }
1216
1217 fp->fPatIdx = savePatIdx;
1218 return (REStackFrame *)newFP;
1219 }
1220
1221
1222 //--------------------------------------------------------------------------------
1223 //
1224 // MatchAt This is the actual matching engine.
1225 //
1226 // startIdx: begin matching a this index.
1227 // toEnd: if true, match must extend to end of the input region
1228 //
1229 //--------------------------------------------------------------------------------
MatchAt(int32_t startIdx,UBool toEnd,UErrorCode & status)1230 void RegexMatcher::MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
1231 UBool isMatch = FALSE; // True if the we have a match.
1232
1233 int32_t op; // Operation from the compiled pattern, split into
1234 int32_t opType; // the opcode
1235 int32_t opValue; // and the operand value.
1236
1237 #ifdef REGEX_RUN_DEBUG
1238 if (fTraceDebug)
1239 {
1240 printf("MatchAt(startIdx=%d)\n", startIdx);
1241 printf("Original Pattern: ");
1242 int32_t i;
1243 for (i=0; i<fPattern->fPattern.length(); i++) {
1244 printf("%c", fPattern->fPattern.charAt(i));
1245 }
1246 printf("\n");
1247 printf("Input String: ");
1248 for (i=0; i<fInput->length(); i++) {
1249 UChar c = fInput->charAt(i);
1250 if (c<32 || c>256) {
1251 c = '.';
1252 }
1253 printf("%c", c);
1254 }
1255 printf("\n");
1256 printf("\n");
1257 }
1258 #endif
1259
1260 if (U_FAILURE(status)) {
1261 return;
1262 }
1263
1264 // Cache frequently referenced items from the compiled pattern
1265 // in local variables.
1266 //
1267 int32_t *pat = fPattern->fCompiledPat->getBuffer();
1268
1269 const UChar *litText = fPattern->fLiteralText.getBuffer();
1270 UVector *sets = fPattern->fSets;
1271
1272 const UChar *inputBuf = fInput->getBuffer();
1273
1274 REStackFrame *fp = resetStack();
1275 int32_t frameSize = fPattern->fFrameSize;
1276
1277 fp->fPatIdx = 0;
1278 fp->fInputIdx = startIdx;
1279
1280 // Zero out the pattern's static data
1281 int32_t i;
1282 for (i = 0; i<fPattern->fDataSize; i++) {
1283 fData[i] = 0;
1284 }
1285
1286 //
1287 // Main loop for interpreting the compiled pattern.
1288 // One iteration of the loop per pattern operation performed.
1289 //
1290 for (;;) {
1291 #if 0
1292 if (_heapchk() != _HEAPOK) {
1293 fprintf(stderr, "Heap Trouble\n");
1294 }
1295 #endif
1296 op = pat[fp->fPatIdx];
1297 opType = URX_TYPE(op);
1298 opValue = URX_VAL(op);
1299 #ifdef REGEX_RUN_DEBUG
1300 if (fTraceDebug) {
1301 printf("inputIdx=%d inputChar=%c sp=%3d ", fp->fInputIdx,
1302 fInput->char32At(fp->fInputIdx), (int32_t *)fp-fStack->getBuffer());
1303 fPattern->dumpOp(fp->fPatIdx);
1304 }
1305 #endif
1306 fp->fPatIdx++;
1307
1308 switch (opType) {
1309
1310
1311 case URX_NOP:
1312 break;
1313
1314
1315 case URX_BACKTRACK:
1316 // Force a backtrack. In some circumstances, the pattern compiler
1317 // will notice that the pattern can't possibly match anything, and will
1318 // emit one of these at that point.
1319 fp = (REStackFrame *)fStack->popFrame(frameSize);
1320 break;
1321
1322
1323 case URX_ONECHAR:
1324 if (fp->fInputIdx < fActiveLimit) {
1325 UChar32 c;
1326 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1327 if (c == opValue) {
1328 break;
1329 }
1330 } else {
1331 fHitEnd = TRUE;
1332 }
1333 fp = (REStackFrame *)fStack->popFrame(frameSize);
1334 break;
1335
1336
1337 case URX_STRING:
1338 {
1339 // Test input against a literal string.
1340 // Strings require two slots in the compiled pattern, one for the
1341 // offset to the string text, and one for the length.
1342 int32_t stringStartIdx = opValue;
1343 int32_t stringLen;
1344
1345 op = pat[fp->fPatIdx]; // Fetch the second operand
1346 fp->fPatIdx++;
1347 opType = URX_TYPE(op);
1348 stringLen = URX_VAL(op);
1349 U_ASSERT(opType == URX_STRING_LEN);
1350 U_ASSERT(stringLen >= 2);
1351
1352 if (fp->fInputIdx + stringLen > fActiveLimit) {
1353 // No match. String is longer than the remaining input text.
1354 fHitEnd = TRUE; // TODO: See ticket 6074
1355 fp = (REStackFrame *)fStack->popFrame(frameSize);
1356 break;
1357 }
1358
1359 const UChar * pInp = inputBuf + fp->fInputIdx;
1360 const UChar * pPat = litText+stringStartIdx;
1361 const UChar * pEnd = pInp + stringLen;
1362 for(;;) {
1363 if (*pInp == *pPat) {
1364 pInp++;
1365 pPat++;
1366 if (pInp == pEnd) {
1367 // Successful Match.
1368 fp->fInputIdx += stringLen;
1369 break;
1370 }
1371 } else {
1372 // Match failed.
1373 fp = (REStackFrame *)fStack->popFrame(frameSize);
1374 break;
1375 }
1376 }
1377 }
1378 break;
1379
1380
1381
1382 case URX_STATE_SAVE:
1383 fp = StateSave(fp, opValue, frameSize, status);
1384 break;
1385
1386
1387 case URX_END:
1388 // The match loop will exit via this path on a successful match,
1389 // when we reach the end of the pattern.
1390 if (toEnd && fp->fInputIdx != fActiveLimit) {
1391 // The pattern matched, but not to the end of input. Try some more.
1392 fp = (REStackFrame *)fStack->popFrame(frameSize);
1393 break;
1394 }
1395 isMatch = TRUE;
1396 goto breakFromLoop;
1397
1398 // Start and End Capture stack frame variables are layout out like this:
1399 // fp->fExtra[opValue] - The start of a completed capture group
1400 // opValue+1 - The end of a completed capture group
1401 // opValue+2 - the start of a capture group whose end
1402 // has not yet been reached (and might not ever be).
1403 case URX_START_CAPTURE:
1404 U_ASSERT(opValue >= 0 && opValue < frameSize-3);
1405 fp->fExtra[opValue+2] = fp->fInputIdx;
1406 break;
1407
1408
1409 case URX_END_CAPTURE:
1410 U_ASSERT(opValue >= 0 && opValue < frameSize-3);
1411 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
1412 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
1413 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
1414 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
1415 break;
1416
1417
1418 case URX_DOLLAR: // $, test for End of line
1419 // or for position before new line at end of input
1420 if (fp->fInputIdx < fAnchorLimit-2) {
1421 // We are no where near the end of input. Fail.
1422 // This is the common case. Keep it first.
1423 fp = (REStackFrame *)fStack->popFrame(frameSize);
1424 break;
1425 }
1426 if (fp->fInputIdx >= fAnchorLimit) {
1427 // We really are at the end of input. Success.
1428 fHitEnd = TRUE;
1429 fRequireEnd = TRUE;
1430 break;
1431 }
1432 // If we are positioned just before a new-line that is located at the
1433 // end of input, succeed.
1434 if (fp->fInputIdx == fAnchorLimit-1) {
1435 UChar32 c = fInput->char32At(fp->fInputIdx);
1436 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
1437 // If not in the middle of a CR/LF sequence
1438 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
1439 // At new-line at end of input. Success
1440 fHitEnd = TRUE;
1441 fRequireEnd = TRUE;
1442 break;
1443 }
1444 }
1445 }
1446
1447 if (fp->fInputIdx == fAnchorLimit-2 &&
1448 fInput->char32At(fp->fInputIdx) == 0x0d && fInput->char32At(fp->fInputIdx+1) == 0x0a) {
1449 fHitEnd = TRUE;
1450 fRequireEnd = TRUE;
1451 break; // At CR/LF at end of input. Success
1452 }
1453
1454 fp = (REStackFrame *)fStack->popFrame(frameSize);
1455
1456 break;
1457
1458
1459 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
1460 if (fp->fInputIdx >= fAnchorLimit-1) {
1461 // Either at the last character of input, or off the end.
1462 if (fp->fInputIdx == fAnchorLimit-1) {
1463 // At last char of input. Success if it's a new line.
1464 if (fInput->char32At(fp->fInputIdx) == 0x0a) {
1465 fHitEnd = TRUE;
1466 fRequireEnd = TRUE;
1467 break;
1468 }
1469 } else {
1470 // Off the end of input. Success.
1471 fHitEnd = TRUE;
1472 fRequireEnd = TRUE;
1473 break;
1474 }
1475 }
1476
1477 // Not at end of input. Back-track out.
1478 fp = (REStackFrame *)fStack->popFrame(frameSize);
1479 break;
1480
1481
1482 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
1483 {
1484 if (fp->fInputIdx >= fAnchorLimit) {
1485 // We really are at the end of input. Success.
1486 fHitEnd = TRUE;
1487 fRequireEnd = TRUE;
1488 break;
1489 }
1490 // If we are positioned just before a new-line, succeed.
1491 // It makes no difference where the new-line is within the input.
1492 UChar32 c = inputBuf[fp->fInputIdx];
1493 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
1494 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
1495 // In multi-line mode, hitting a new-line just before the end of input does not
1496 // set the hitEnd or requireEnd flags
1497 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
1498 break;
1499 }
1500 }
1501 // not at a new line. Fail.
1502 fp = (REStackFrame *)fStack->popFrame(frameSize);
1503 }
1504 break;
1505
1506
1507 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
1508 {
1509 if (fp->fInputIdx >= fAnchorLimit) {
1510 // We really are at the end of input. Success.
1511 fHitEnd = TRUE;
1512 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
1513 break; // adding a new-line would not lose the match.
1514 }
1515 // If we are not positioned just before a new-line, the test fails; backtrack out.
1516 // It makes no difference where the new-line is within the input.
1517 if (inputBuf[fp->fInputIdx] != 0x0a) {
1518 fp = (REStackFrame *)fStack->popFrame(frameSize);
1519 }
1520 }
1521 break;
1522
1523
1524 case URX_CARET: // ^, test for start of line
1525 if (fp->fInputIdx != fAnchorStart) {
1526 fp = (REStackFrame *)fStack->popFrame(frameSize);
1527 }
1528 break;
1529
1530
1531 case URX_CARET_M: // ^, test for start of line in mulit-line mode
1532 {
1533 if (fp->fInputIdx == fAnchorStart) {
1534 // We are at the start input. Success.
1535 break;
1536 }
1537 // Check whether character just before the current pos is a new-line
1538 // unless we are at the end of input
1539 UChar c = inputBuf[fp->fInputIdx - 1];
1540 if ((fp->fInputIdx < fAnchorLimit) &&
1541 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
1542 // It's a new-line. ^ is true. Success.
1543 // TODO: what should be done with positions between a CR and LF?
1544 break;
1545 }
1546 // Not at the start of a line. Fail.
1547 fp = (REStackFrame *)fStack->popFrame(frameSize);
1548 }
1549 break;
1550
1551
1552 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
1553 {
1554 U_ASSERT(fp->fInputIdx >= fAnchorStart);
1555 if (fp->fInputIdx <= fAnchorStart) {
1556 // We are at the start input. Success.
1557 break;
1558 }
1559 // Check whether character just before the current pos is a new-line
1560 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
1561 UChar c = inputBuf[fp->fInputIdx - 1];
1562 if (c != 0x0a) {
1563 // Not at the start of a line. Back-track out.
1564 fp = (REStackFrame *)fStack->popFrame(frameSize);
1565 }
1566 }
1567 break;
1568
1569 case URX_BACKSLASH_B: // Test for word boundaries
1570 {
1571 UBool success = isWordBoundary(fp->fInputIdx);
1572 success ^= (opValue != 0); // flip sense for \B
1573 if (!success) {
1574 fp = (REStackFrame *)fStack->popFrame(frameSize);
1575 }
1576 }
1577 break;
1578
1579
1580 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
1581 {
1582 UBool success = isUWordBoundary(fp->fInputIdx);
1583 success ^= (opValue != 0); // flip sense for \B
1584 if (!success) {
1585 fp = (REStackFrame *)fStack->popFrame(frameSize);
1586 }
1587 }
1588 break;
1589
1590
1591 case URX_BACKSLASH_D: // Test for decimal digit
1592 {
1593 if (fp->fInputIdx >= fActiveLimit) {
1594 fHitEnd = TRUE;
1595 fp = (REStackFrame *)fStack->popFrame(frameSize);
1596 break;
1597 }
1598
1599 UChar32 c = fInput->char32At(fp->fInputIdx);
1600 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
1601 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
1602 success ^= (opValue != 0); // flip sense for \D
1603 if (success) {
1604 fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
1605 } else {
1606 fp = (REStackFrame *)fStack->popFrame(frameSize);
1607 }
1608 }
1609 break;
1610
1611
1612 case URX_BACKSLASH_G: // Test for position at end of previous match
1613 if (!((fMatch && fp->fInputIdx==fMatchEnd) || fMatch==FALSE && fp->fInputIdx==fActiveStart)) {
1614 fp = (REStackFrame *)fStack->popFrame(frameSize);
1615 }
1616 break;
1617
1618
1619 case URX_BACKSLASH_X:
1620 // Match a Grapheme, as defined by Unicode TR 29.
1621 // Differs slightly from Perl, which consumes combining marks independently
1622 // of context.
1623 {
1624
1625 // Fail if at end of input
1626 if (fp->fInputIdx >= fActiveLimit) {
1627 fHitEnd = TRUE;
1628 fp = (REStackFrame *)fStack->popFrame(frameSize);
1629 break;
1630 }
1631
1632 // Examine (and consume) the current char.
1633 // Dispatch into a little state machine, based on the char.
1634 UChar32 c;
1635 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1636 UnicodeSet **sets = fPattern->fStaticSets;
1637 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
1638 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
1639 if (sets[URX_GC_L]->contains(c)) goto GC_L;
1640 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
1641 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
1642 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1643 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1644 goto GC_Extend;
1645
1646
1647
1648 GC_L:
1649 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
1650 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1651 if (sets[URX_GC_L]->contains(c)) goto GC_L;
1652 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
1653 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
1654 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1655 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1656 goto GC_Extend;
1657
1658 GC_V:
1659 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
1660 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1661 if (sets[URX_GC_V]->contains(c)) goto GC_V;
1662 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1663 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1664 goto GC_Extend;
1665
1666 GC_T:
1667 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
1668 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1669 if (sets[URX_GC_T]->contains(c)) goto GC_T;
1670 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
1671 goto GC_Extend;
1672
1673 GC_Extend:
1674 // Combining characters are consumed here
1675 for (;;) {
1676 if (fp->fInputIdx >= fActiveLimit) {
1677 break;
1678 }
1679 U16_GET(inputBuf, 0, fp->fInputIdx, fActiveLimit, c);
1680 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
1681 break;
1682 }
1683 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
1684 }
1685 goto GC_Done;
1686
1687 GC_Control:
1688 // Most control chars stand alone (don't combine with combining chars),
1689 // except for that CR/LF sequence is a single grapheme cluster.
1690 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
1691 fp->fInputIdx++;
1692 }
1693
1694 GC_Done:
1695 if (fp->fInputIdx >= fActiveLimit) {
1696 fHitEnd = TRUE;
1697 }
1698 break;
1699 }
1700
1701
1702
1703
1704 case URX_BACKSLASH_Z: // Test for end of Input
1705 if (fp->fInputIdx < fAnchorLimit) {
1706 fp = (REStackFrame *)fStack->popFrame(frameSize);
1707 } else {
1708 fHitEnd = TRUE;
1709 fRequireEnd = TRUE;
1710 }
1711 break;
1712
1713
1714
1715 case URX_STATIC_SETREF:
1716 {
1717 // Test input character against one of the predefined sets
1718 // (Word Characters, for example)
1719 // The high bit of the op value is a flag for the match polarity.
1720 // 0: success if input char is in set.
1721 // 1: success if input char is not in set.
1722 if (fp->fInputIdx >= fActiveLimit) {
1723 fHitEnd = TRUE;
1724 fp = (REStackFrame *)fStack->popFrame(frameSize);
1725 break;
1726 }
1727
1728 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
1729 opValue &= ~URX_NEG_SET;
1730 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1731 UChar32 c;
1732 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1733 if (c < 256) {
1734 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1735 if (s8->contains(c)) {
1736 success = !success;
1737 }
1738 } else {
1739 const UnicodeSet *s = fPattern->fStaticSets[opValue];
1740 if (s->contains(c)) {
1741 success = !success;
1742 }
1743 }
1744 if (!success) {
1745 fp = (REStackFrame *)fStack->popFrame(frameSize);
1746 }
1747 }
1748 break;
1749
1750
1751 case URX_STAT_SETREF_N:
1752 {
1753 // Test input character for NOT being a member of one of
1754 // the predefined sets (Word Characters, for example)
1755 if (fp->fInputIdx >= fActiveLimit) {
1756 fHitEnd = TRUE;
1757 fp = (REStackFrame *)fStack->popFrame(frameSize);
1758 break;
1759 }
1760
1761 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
1762 UChar32 c;
1763 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1764 if (c < 256) {
1765 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
1766 if (s8->contains(c) == FALSE) {
1767 break;
1768 }
1769 } else {
1770 const UnicodeSet *s = fPattern->fStaticSets[opValue];
1771 if (s->contains(c) == FALSE) {
1772 break;
1773 }
1774 }
1775
1776 fp = (REStackFrame *)fStack->popFrame(frameSize);
1777 }
1778 break;
1779
1780
1781 case URX_SETREF:
1782 if (fp->fInputIdx >= fActiveLimit) {
1783 fHitEnd = TRUE;
1784 fp = (REStackFrame *)fStack->popFrame(frameSize);
1785 break;
1786 }
1787 // There is input left. Pick up one char and test it for set membership.
1788 UChar32 c;
1789 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1790 U_ASSERT(opValue > 0 && opValue < sets->size());
1791 if (c<256) {
1792 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
1793 if (s8->contains(c)) {
1794 break;
1795 }
1796 } else {
1797 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
1798 if (s->contains(c)) {
1799 // The character is in the set. A Match.
1800 break;
1801 }
1802 }
1803 // the character wasn't in the set. Back track out.
1804 fp = (REStackFrame *)fStack->popFrame(frameSize);
1805 break;
1806
1807
1808 case URX_DOTANY:
1809 {
1810 // . matches anything, but stops at end-of-line.
1811 if (fp->fInputIdx >= fActiveLimit) {
1812 // At end of input. Match failed. Backtrack out.
1813 fHitEnd = TRUE;
1814 fp = (REStackFrame *)fStack->popFrame(frameSize);
1815 break;
1816 }
1817 // There is input left. Advance over one char, unless we've hit end-of-line
1818 UChar32 c;
1819 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1820 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1821 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
1822 // End of line in normal mode. . does not match.
1823 fp = (REStackFrame *)fStack->popFrame(frameSize);
1824 break;
1825 }
1826 }
1827 break;
1828
1829
1830 case URX_DOTANY_ALL:
1831 {
1832 // ., in dot-matches-all (including new lines) mode
1833 if (fp->fInputIdx >= fActiveLimit) {
1834 // At end of input. Match failed. Backtrack out.
1835 fHitEnd = TRUE;
1836 fp = (REStackFrame *)fStack->popFrame(frameSize);
1837 break;
1838 }
1839 // There is input left. Advance over one char, except if we are
1840 // at a cr/lf, advance over both of them.
1841 UChar32 c;
1842 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1843 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
1844 // In the case of a CR/LF, we need to advance over both.
1845 UChar nextc = inputBuf[fp->fInputIdx];
1846 if (nextc == 0x0a) {
1847 fp->fInputIdx++;
1848 }
1849 }
1850 }
1851 break;
1852
1853
1854 case URX_DOTANY_UNIX:
1855 {
1856 // '.' operator, matches all, but stops at end-of-line.
1857 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
1858 if (fp->fInputIdx >= fActiveLimit) {
1859 // At end of input. Match failed. Backtrack out.
1860 fHitEnd = TRUE;
1861 fp = (REStackFrame *)fStack->popFrame(frameSize);
1862 break;
1863 }
1864 // There is input left. Advance over one char, unless we've hit end-of-line
1865 UChar32 c;
1866 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
1867 if (c == 0x0a) {
1868 // End of line in normal mode. '.' does not match the \n
1869 fp = (REStackFrame *)fStack->popFrame(frameSize);
1870 }
1871 }
1872 break;
1873
1874
1875 case URX_JMP:
1876 fp->fPatIdx = opValue;
1877 break;
1878
1879 case URX_FAIL:
1880 isMatch = FALSE;
1881 goto breakFromLoop;
1882
1883 case URX_JMP_SAV:
1884 U_ASSERT(opValue < fPattern->fCompiledPat->size());
1885 fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current
1886 fp->fPatIdx = opValue; // Then JMP.
1887 break;
1888
1889 case URX_JMP_SAV_X:
1890 // This opcode is used with (x)+, when x can match a zero length string.
1891 // Same as JMP_SAV, except conditional on the match having made forward progress.
1892 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
1893 // data address of the input position at the start of the loop.
1894 {
1895 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
1896 int32_t stoOp = pat[opValue-1];
1897 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
1898 int32_t frameLoc = URX_VAL(stoOp);
1899 U_ASSERT(frameLoc >= 0 && frameLoc < frameSize);
1900 int32_t prevInputIdx = fp->fExtra[frameLoc];
1901 U_ASSERT(prevInputIdx <= fp->fInputIdx);
1902 if (prevInputIdx < fp->fInputIdx) {
1903 // The match did make progress. Repeat the loop.
1904 fp = StateSave(fp, fp->fPatIdx, frameSize, status); // State save to loc following current
1905 fp->fPatIdx = opValue;
1906 fp->fExtra[frameLoc] = fp->fInputIdx;
1907 }
1908 // If the input position did not advance, we do nothing here,
1909 // execution will fall out of the loop.
1910 }
1911 break;
1912
1913 case URX_CTR_INIT:
1914 {
1915 U_ASSERT(opValue >= 0 && opValue < frameSize-2);
1916 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
1917
1918 // Pick up the three extra operands that CTR_INIT has, and
1919 // skip the pattern location counter past
1920 int32_t instrOperandLoc = fp->fPatIdx;
1921 fp->fPatIdx += 3;
1922 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
1923 int32_t minCount = pat[instrOperandLoc+1];
1924 int32_t maxCount = pat[instrOperandLoc+2];
1925 U_ASSERT(minCount>=0);
1926 U_ASSERT(maxCount>=minCount || maxCount==-1);
1927 U_ASSERT(loopLoc>fp->fPatIdx);
1928
1929 if (minCount == 0) {
1930 fp = StateSave(fp, loopLoc+1, frameSize, status);
1931 }
1932 if (maxCount == 0) {
1933 fp = (REStackFrame *)fStack->popFrame(frameSize);
1934 }
1935 }
1936 break;
1937
1938 case URX_CTR_LOOP:
1939 {
1940 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
1941 int32_t initOp = pat[opValue];
1942 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
1943 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
1944 int32_t minCount = pat[opValue+2];
1945 int32_t maxCount = pat[opValue+3];
1946 // Increment the counter. Note: we're not worrying about counter
1947 // overflow, since the data comes from UnicodeStrings, which
1948 // stores its length in an int32_t.
1949 (*pCounter)++;
1950 U_ASSERT(*pCounter > 0);
1951 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
1952 U_ASSERT(*pCounter == maxCount || maxCount == -1);
1953 break;
1954 }
1955 if (*pCounter >= minCount) {
1956 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
1957 }
1958 fp->fPatIdx = opValue + 4; // Loop back.
1959 }
1960 break;
1961
1962 case URX_CTR_INIT_NG:
1963 {
1964 // Initialize a non-greedy loop
1965 U_ASSERT(opValue >= 0 && opValue < frameSize-2);
1966 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
1967
1968 // Pick up the three extra operands that CTR_INIT has, and
1969 // skip the pattern location counter past
1970 int32_t instrOperandLoc = fp->fPatIdx;
1971 fp->fPatIdx += 3;
1972 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
1973 int32_t minCount = pat[instrOperandLoc+1];
1974 int32_t maxCount = pat[instrOperandLoc+2];
1975 U_ASSERT(minCount>=0);
1976 U_ASSERT(maxCount>=minCount || maxCount==-1);
1977 U_ASSERT(loopLoc>fp->fPatIdx);
1978
1979 if (minCount == 0) {
1980 if (maxCount != 0) {
1981 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
1982 }
1983 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
1984 }
1985 }
1986 break;
1987
1988 case URX_CTR_LOOP_NG:
1989 {
1990 // Non-greedy {min, max} loops
1991 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
1992 int32_t initOp = pat[opValue];
1993 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
1994 int32_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
1995 int32_t minCount = pat[opValue+2];
1996 int32_t maxCount = pat[opValue+3];
1997 // Increment the counter. Note: we're not worrying about counter
1998 // overflow, since the data comes from UnicodeStrings, which
1999 // stores its length in an int32_t.
2000 (*pCounter)++;
2001 U_ASSERT(*pCounter > 0);
2002
2003 if ((uint32_t)*pCounter >= (uint32_t)maxCount) {
2004 // The loop has matched the maximum permitted number of times.
2005 // Break out of here with no action. Matching will
2006 // continue with the following pattern.
2007 U_ASSERT(*pCounter == maxCount || maxCount == -1);
2008 break;
2009 }
2010
2011 if (*pCounter < minCount) {
2012 // We haven't met the minimum number of matches yet.
2013 // Loop back for another one.
2014 fp->fPatIdx = opValue + 4; // Loop back.
2015 } else {
2016 // We do have the minimum number of matches.
2017 // Fall into the following pattern, but first do
2018 // a state save to the top of the loop, so that a failure
2019 // in the following pattern will try another iteration of the loop.
2020 fp = StateSave(fp, opValue + 4, frameSize, status);
2021 }
2022 }
2023 break;
2024
2025 case URX_STO_SP:
2026 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
2027 fData[opValue] = fStack->size();
2028 break;
2029
2030 case URX_LD_SP:
2031 {
2032 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
2033 int32_t newStackSize = fData[opValue];
2034 U_ASSERT(newStackSize <= fStack->size());
2035 int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
2036 if (newFP == (int32_t *)fp) {
2037 break;
2038 }
2039 int32_t i;
2040 for (i=0; i<frameSize; i++) {
2041 newFP[i] = ((int32_t *)fp)[i];
2042 }
2043 fp = (REStackFrame *)newFP;
2044 fStack->setSize(newStackSize);
2045 }
2046 break;
2047
2048 case URX_BACKREF:
2049 case URX_BACKREF_I:
2050 {
2051 U_ASSERT(opValue < frameSize);
2052 int32_t groupStartIdx = fp->fExtra[opValue];
2053 int32_t groupEndIdx = fp->fExtra[opValue+1];
2054 U_ASSERT(groupStartIdx <= groupEndIdx);
2055 int32_t len = groupEndIdx-groupStartIdx;
2056 if (groupStartIdx < 0) {
2057 // This capture group has not participated in the match thus far,
2058 fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
2059 }
2060
2061 if (len == 0) {
2062 // The capture group match was of an empty string.
2063 // Verified by testing: Perl matches succeed in this case, so
2064 // we do too.
2065 break;
2066 }
2067
2068 UBool haveMatch = FALSE;
2069 if (fp->fInputIdx + len <= fActiveLimit) {
2070 if (opType == URX_BACKREF) {
2071 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) {
2072 haveMatch = TRUE;
2073 }
2074 } else {
2075 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
2076 len, U_FOLD_CASE_DEFAULT) == 0) {
2077 haveMatch = TRUE;
2078 }
2079 }
2080 } else {
2081 // TODO: probably need to do a partial string comparison, and only
2082 // set HitEnd if the available input matched. Ticket #6074
2083 fHitEnd = TRUE;
2084 }
2085 if (haveMatch) {
2086 fp->fInputIdx += len; // Match. Advance current input position.
2087 } else {
2088 fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
2089 }
2090 }
2091 break;
2092
2093 case URX_STO_INP_LOC:
2094 {
2095 U_ASSERT(opValue >= 0 && opValue < frameSize);
2096 fp->fExtra[opValue] = fp->fInputIdx;
2097 }
2098 break;
2099
2100 case URX_JMPX:
2101 {
2102 int32_t instrOperandLoc = fp->fPatIdx;
2103 fp->fPatIdx += 1;
2104 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
2105 U_ASSERT(dataLoc >= 0 && dataLoc < frameSize);
2106 int32_t savedInputIdx = fp->fExtra[dataLoc];
2107 U_ASSERT(savedInputIdx <= fp->fInputIdx);
2108 if (savedInputIdx < fp->fInputIdx) {
2109 fp->fPatIdx = opValue; // JMP
2110 } else {
2111 fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no progress in loop.
2112 }
2113 }
2114 break;
2115
2116 case URX_LA_START:
2117 {
2118 // Entering a lookahead block.
2119 // Save Stack Ptr, Input Pos.
2120 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2121 fData[opValue] = fStack->size();
2122 fData[opValue+1] = fp->fInputIdx;
2123 fActiveStart = fLookStart; // Set the match region change for
2124 fActiveLimit = fLookLimit; // transparent bounds.
2125 }
2126 break;
2127
2128 case URX_LA_END:
2129 {
2130 // Leaving a look-ahead block.
2131 // restore Stack Ptr, Input Pos to positions they had on entry to block.
2132 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2133 int32_t stackSize = fStack->size();
2134 int32_t newStackSize = fData[opValue];
2135 U_ASSERT(stackSize >= newStackSize);
2136 if (stackSize > newStackSize) {
2137 // Copy the current top frame back to the new (cut back) top frame.
2138 // This makes the capture groups from within the look-ahead
2139 // expression available.
2140 int32_t *newFP = fStack->getBuffer() + newStackSize - frameSize;
2141 int32_t i;
2142 for (i=0; i<frameSize; i++) {
2143 newFP[i] = ((int32_t *)fp)[i];
2144 }
2145 fp = (REStackFrame *)newFP;
2146 fStack->setSize(newStackSize);
2147 }
2148 fp->fInputIdx = fData[opValue+1];
2149
2150 // Restore the active region bounds in the input string; they may have
2151 // been changed because of transparent bounds on a Region.
2152 fActiveStart = fRegionStart;
2153 fActiveLimit = fRegionLimit;
2154 }
2155 break;
2156
2157 case URX_ONECHAR_I:
2158 if (fp->fInputIdx < fActiveLimit) {
2159 UChar32 c;
2160 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
2161 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
2162 break;
2163 }
2164 } else {
2165 fHitEnd = TRUE;
2166 }
2167 fp = (REStackFrame *)fStack->popFrame(frameSize);
2168 break;
2169
2170 case URX_STRING_I:
2171 {
2172 // Test input against a literal string.
2173 // Strings require two slots in the compiled pattern, one for the
2174 // offset to the string text, and one for the length.
2175 int32_t stringStartIdx, stringLen;
2176 stringStartIdx = opValue;
2177
2178 op = pat[fp->fPatIdx];
2179 fp->fPatIdx++;
2180 opType = URX_TYPE(op);
2181 opValue = URX_VAL(op);
2182 U_ASSERT(opType == URX_STRING_LEN);
2183 stringLen = opValue;
2184
2185 int32_t stringEndIndex = fp->fInputIdx + stringLen;
2186 if (stringEndIndex <= fActiveLimit) {
2187 if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
2188 stringLen, U_FOLD_CASE_DEFAULT) == 0) {
2189 // Success. Advance the current input position.
2190 fp->fInputIdx = stringEndIndex;
2191 break;
2192 }
2193 } else {
2194 // Insufficent input left for a match.
2195 fHitEnd = TRUE; // See ticket 6074
2196 }
2197 // No match. Back up matching to a saved state
2198 fp = (REStackFrame *)fStack->popFrame(frameSize);
2199 }
2200 break;
2201
2202 case URX_LB_START:
2203 {
2204 // Entering a look-behind block.
2205 // Save Stack Ptr, Input Pos.
2206 // TODO: implement transparent bounds. Ticket #6067
2207 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2208 fData[opValue] = fStack->size();
2209 fData[opValue+1] = fp->fInputIdx;
2210 // Init the variable containing the start index for attempted matches.
2211 fData[opValue+2] = -1;
2212 // Save input string length, then reset to pin any matches to end at
2213 // the current position.
2214 fData[opValue+3] = fActiveLimit;
2215 fActiveLimit = fp->fInputIdx;
2216 }
2217 break;
2218
2219
2220 case URX_LB_CONT:
2221 {
2222 // Positive Look-Behind, at top of loop checking for matches of LB expression
2223 // at all possible input starting positions.
2224
2225 // Fetch the min and max possible match lengths. They are the operands
2226 // of this op in the pattern.
2227 int32_t minML = pat[fp->fPatIdx++];
2228 int32_t maxML = pat[fp->fPatIdx++];
2229 U_ASSERT(minML <= maxML);
2230 U_ASSERT(minML >= 0);
2231
2232 // Fetch (from data) the last input index where a match was attempted.
2233 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2234 int32_t *lbStartIdx = &fData[opValue+2];
2235 if (*lbStartIdx < 0) {
2236 // First time through loop.
2237 *lbStartIdx = fp->fInputIdx - minML;
2238 } else {
2239 // 2nd through nth time through the loop.
2240 // Back up start position for match by one.
2241 if (*lbStartIdx == 0) {
2242 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
2243 } else {
2244 U16_BACK_1(inputBuf, 0, *lbStartIdx);
2245 }
2246 }
2247
2248 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
2249 // We have tried all potential match starting points without
2250 // getting a match. Backtrack out, and out of the
2251 // Look Behind altogether.
2252 fp = (REStackFrame *)fStack->popFrame(frameSize);
2253 int32_t restoreInputLen = fData[opValue+3];
2254 U_ASSERT(restoreInputLen >= fActiveLimit);
2255 U_ASSERT(restoreInputLen <= fInput->length());
2256 fActiveLimit = restoreInputLen;
2257 break;
2258 }
2259
2260 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2261 // (successful match will fall off the end of the loop.)
2262 fp = StateSave(fp, fp->fPatIdx-3, frameSize, status);
2263 fp->fInputIdx = *lbStartIdx;
2264 }
2265 break;
2266
2267 case URX_LB_END:
2268 // End of a look-behind block, after a successful match.
2269 {
2270 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2271 if (fp->fInputIdx != fActiveLimit) {
2272 // The look-behind expression matched, but the match did not
2273 // extend all the way to the point that we are looking behind from.
2274 // FAIL out of here, which will take us back to the LB_CONT, which
2275 // will retry the match starting at another position or fail
2276 // the look-behind altogether, whichever is appropriate.
2277 fp = (REStackFrame *)fStack->popFrame(frameSize);
2278 break;
2279 }
2280
2281 // Look-behind match is good. Restore the orignal input string length,
2282 // which had been truncated to pin the end of the lookbehind match to the
2283 // position being looked-behind.
2284 int32_t originalInputLen = fData[opValue+3];
2285 U_ASSERT(originalInputLen >= fActiveLimit);
2286 U_ASSERT(originalInputLen <= fInput->length());
2287 fActiveLimit = originalInputLen;
2288 }
2289 break;
2290
2291
2292 case URX_LBN_CONT:
2293 {
2294 // Negative Look-Behind, at top of loop checking for matches of LB expression
2295 // at all possible input starting positions.
2296
2297 // Fetch the extra parameters of this op.
2298 int32_t minML = pat[fp->fPatIdx++];
2299 int32_t maxML = pat[fp->fPatIdx++];
2300 int32_t continueLoc = pat[fp->fPatIdx++];
2301 continueLoc = URX_VAL(continueLoc);
2302 U_ASSERT(minML <= maxML);
2303 U_ASSERT(minML >= 0);
2304 U_ASSERT(continueLoc > fp->fPatIdx);
2305
2306 // Fetch (from data) the last input index where a match was attempted.
2307 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2308 int32_t *lbStartIdx = &fData[opValue+2];
2309 if (*lbStartIdx < 0) {
2310 // First time through loop.
2311 *lbStartIdx = fp->fInputIdx - minML;
2312 } else {
2313 // 2nd through nth time through the loop.
2314 // Back up start position for match by one.
2315 if (*lbStartIdx == 0) {
2316 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
2317 } else {
2318 U16_BACK_1(inputBuf, 0, *lbStartIdx);
2319 }
2320 }
2321
2322 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
2323 // We have tried all potential match starting points without
2324 // getting a match, which means that the negative lookbehind as
2325 // a whole has succeeded. Jump forward to the continue location
2326 int32_t restoreInputLen = fData[opValue+3];
2327 U_ASSERT(restoreInputLen >= fActiveLimit);
2328 U_ASSERT(restoreInputLen <= fInput->length());
2329 fActiveLimit = restoreInputLen;
2330 fp->fPatIdx = continueLoc;
2331 break;
2332 }
2333
2334 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
2335 // (successful match will cause a FAIL out of the loop altogether.)
2336 fp = StateSave(fp, fp->fPatIdx-4, frameSize, status);
2337 fp->fInputIdx = *lbStartIdx;
2338 }
2339 break;
2340
2341 case URX_LBN_END:
2342 // End of a negative look-behind block, after a successful match.
2343 {
2344 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2345 if (fp->fInputIdx != fActiveLimit) {
2346 // The look-behind expression matched, but the match did not
2347 // extend all the way to the point that we are looking behind from.
2348 // FAIL out of here, which will take us back to the LB_CONT, which
2349 // will retry the match starting at another position or succeed
2350 // the look-behind altogether, whichever is appropriate.
2351 fp = (REStackFrame *)fStack->popFrame(frameSize);
2352 break;
2353 }
2354
2355 // Look-behind expression matched, which means look-behind test as
2356 // a whole Fails
2357
2358 // Restore the orignal input string length, which had been truncated
2359 // inorder to pin the end of the lookbehind match
2360 // to the position being looked-behind.
2361 int32_t originalInputLen = fData[opValue+3];
2362 U_ASSERT(originalInputLen >= fActiveLimit);
2363 U_ASSERT(originalInputLen <= fInput->length());
2364 fActiveLimit = originalInputLen;
2365
2366 // Restore original stack position, discarding any state saved
2367 // by the successful pattern match.
2368 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
2369 int32_t newStackSize = fData[opValue];
2370 U_ASSERT(fStack->size() > newStackSize);
2371 fStack->setSize(newStackSize);
2372
2373 // FAIL, which will take control back to someplace
2374 // prior to entering the look-behind test.
2375 fp = (REStackFrame *)fStack->popFrame(frameSize);
2376 }
2377 break;
2378
2379
2380 case URX_LOOP_SR_I:
2381 // Loop Initialization for the optimized implementation of
2382 // [some character set]*
2383 // This op scans through all matching input.
2384 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2385 {
2386 U_ASSERT(opValue > 0 && opValue < sets->size());
2387 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
2388 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
2389
2390 // Loop through input, until either the input is exhausted or
2391 // we reach a character that is not a member of the set.
2392 int32_t ix = fp->fInputIdx;
2393 for (;;) {
2394 if (ix >= fActiveLimit) {
2395 fHitEnd = TRUE;
2396 break;
2397 }
2398 UChar32 c;
2399 U16_NEXT(inputBuf, ix, fActiveLimit, c);
2400 if (c<256) {
2401 if (s8->contains(c) == FALSE) {
2402 U16_BACK_1(inputBuf, 0, ix);
2403 break;
2404 }
2405 } else {
2406 if (s->contains(c) == FALSE) {
2407 U16_BACK_1(inputBuf, 0, ix);
2408 break;
2409 }
2410 }
2411 }
2412
2413 // If there were no matching characters, skip over the loop altogether.
2414 // The loop doesn't run at all, a * op always succeeds.
2415 if (ix == fp->fInputIdx) {
2416 fp->fPatIdx++; // skip the URX_LOOP_C op.
2417 break;
2418 }
2419
2420 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2421 // must follow. It's operand is the stack location
2422 // that holds the starting input index for the match of this [set]*
2423 int32_t loopcOp = pat[fp->fPatIdx];
2424 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
2425 int32_t stackLoc = URX_VAL(loopcOp);
2426 U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
2427 fp->fExtra[stackLoc] = fp->fInputIdx;
2428 fp->fInputIdx = ix;
2429
2430 // Save State to the URX_LOOP_C op that follows this one,
2431 // so that match failures in the following code will return to there.
2432 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2433 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
2434 fp->fPatIdx++;
2435 }
2436 break;
2437
2438
2439 case URX_LOOP_DOT_I:
2440 // Loop Initialization for the optimized implementation of .*
2441 // This op scans through all remaining input.
2442 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
2443 {
2444 // Loop through input until the input is exhausted (we reach an end-of-line)
2445 // In DOTALL mode, we can just go straight to the end of the input.
2446 int32_t ix;
2447 if ((opValue & 1) == 1) {
2448 // Dot-matches-All mode. Jump straight to the end of the string.
2449 ix = fActiveLimit;
2450 fHitEnd = TRUE;
2451 } else {
2452 // NOT DOT ALL mode. Line endings do not match '.'
2453 // Scan forward until a line ending or end of input.
2454 ix = fp->fInputIdx;
2455 for (;;) {
2456 if (ix >= fActiveLimit) {
2457 fHitEnd = TRUE;
2458 ix = fActiveLimit;
2459 break;
2460 }
2461 UChar32 c;
2462 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
2463 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
2464 if ((c == 0x0a) || // 0x0a is newline in both modes.
2465 ((opValue & 2) == 0) && // IF not UNIX_LINES mode
2466 (c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029) {
2467 // char is a line ending. Put the input pos back to the
2468 // line ending char, and exit the scanning loop.
2469 U16_BACK_1(inputBuf, 0, ix);
2470 break;
2471 }
2472 }
2473 }
2474 }
2475
2476 // If there were no matching characters, skip over the loop altogether.
2477 // The loop doesn't run at all, a * op always succeeds.
2478 if (ix == fp->fInputIdx) {
2479 fp->fPatIdx++; // skip the URX_LOOP_C op.
2480 break;
2481 }
2482
2483 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
2484 // must follow. It's operand is the stack location
2485 // that holds the starting input index for the match of this .*
2486 int32_t loopcOp = pat[fp->fPatIdx];
2487 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
2488 int32_t stackLoc = URX_VAL(loopcOp);
2489 U_ASSERT(stackLoc >= 0 && stackLoc < frameSize);
2490 fp->fExtra[stackLoc] = fp->fInputIdx;
2491 fp->fInputIdx = ix;
2492
2493 // Save State to the URX_LOOP_C op that follows this one,
2494 // so that match failures in the following code will return to there.
2495 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
2496 fp = StateSave(fp, fp->fPatIdx, frameSize, status);
2497 fp->fPatIdx++;
2498 }
2499 break;
2500
2501
2502 case URX_LOOP_C:
2503 {
2504 U_ASSERT(opValue>=0 && opValue<frameSize);
2505 int32_t terminalIdx = fp->fExtra[opValue];
2506 U_ASSERT(terminalIdx <= fp->fInputIdx);
2507 if (terminalIdx == fp->fInputIdx) {
2508 // We've backed up the input idx to the point that the loop started.
2509 // The loop is done. Leave here without saving state.
2510 // Subsequent failures won't come back here.
2511 break;
2512 }
2513 // Set up for the next iteration of the loop, with input index
2514 // backed up by one from the last time through,
2515 // and a state save to this instruction in case the following code fails again.
2516 // (We're going backwards because this loop emulates stack unwinding, not
2517 // the initial scan forward.)
2518 U_ASSERT(fp->fInputIdx > 0);
2519 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
2520 if (inputBuf[fp->fInputIdx] == 0x0a &&
2521 fp->fInputIdx > terminalIdx &&
2522 inputBuf[fp->fInputIdx-1] == 0x0d) {
2523 int32_t prevOp = pat[fp->fPatIdx-2];
2524 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
2525 // .*, stepping back over CRLF pair.
2526 fp->fInputIdx--;
2527 }
2528 }
2529
2530
2531 fp = StateSave(fp, fp->fPatIdx-1, frameSize, status);
2532 }
2533 break;
2534
2535
2536
2537 default:
2538 // Trouble. The compiled pattern contains an entry with an
2539 // unrecognized type tag.
2540 U_ASSERT(FALSE);
2541 }
2542
2543 if (U_FAILURE(status)) {
2544 break;
2545 }
2546 }
2547
2548 breakFromLoop:
2549 fMatch = isMatch;
2550 if (isMatch) {
2551 fLastMatchEnd = fMatchEnd;
2552 fMatchStart = startIdx;
2553 fMatchEnd = fp->fInputIdx;
2554 if (fTraceDebug) {
2555 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
2556 }
2557 }
2558 else
2559 {
2560 if (fTraceDebug) {
2561 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
2562 }
2563 }
2564
2565 fFrame = fp; // The active stack frame when the engine stopped.
2566 // Contains the capture group results that we need to
2567 // access later.
2568
2569 return;
2570 }
2571
2572
2573
2574 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
2575
2576 U_NAMESPACE_END
2577
2578 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
2579
2580