1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2010 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 #include "uvector.h"
25 #include "uvectr32.h"
26 #include "uvectr64.h"
27 #include "regeximp.h"
28 #include "regexst.h"
29 #include "regextxt.h"
30 #include "ucase.h"
31
32 // #include <malloc.h> // Needed for heapcheck testing
33
34
35 // Find progress callback
36 // ----------------------
37 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
38 //
39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
40 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
41
42
43 // Smart Backtracking
44 // ------------------
45 // When a failure would go back to a LOOP_C instruction,
46 // strings, characters, and setrefs scan backwards for a valid start
47 // character themselves, pop the stack, and save state, emulating the
48 // LOOP_C's effect but assured that the next character of input is a
49 // possible matching character.
50 //
51 // Good idea in theory; unfortunately it only helps out a few specific
52 // cases and slows the engine down a little in the rest.
53
54 //#define REGEX_SMART_BACKTRACKING 1
55
56 U_NAMESPACE_BEGIN
57
58 // Default limit for the size of the back track stack, to avoid system
59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per
62 // backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves.
69 // This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72 //-----------------------------------------------------------------------------
73 //
74 // Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)77 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
78 fDeferredStatus = U_ZERO_ERROR;
79 init(fDeferredStatus);
80 if (U_FAILURE(fDeferredStatus)) {
81 return;
82 }
83 if (pat==NULL) {
84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85 return;
86 }
87 fPattern = pat;
88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
90
91
92
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)93 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
94 uint32_t flags, UErrorCode &status) {
95 init(status);
96 if (U_FAILURE(status)) {
97 return;
98 }
99 UParseError pe;
100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
101 fPattern = fPatternOwned;
102
103 UText inputText = UTEXT_INITIALIZER;
104 utext_openConstUnicodeString(&inputText, &input, &status);
105 init2(&inputText, status);
106 utext_close(&inputText);
107
108 fInputUniStrMaybeMutable = TRUE;
109 }
110
111
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113 uint32_t flags, UErrorCode &status) {
114 init(status);
115 if (U_FAILURE(status)) {
116 return;
117 }
118 UParseError pe;
119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
120 if (U_FAILURE(status)) {
121 return;
122 }
123
124 fPattern = fPatternOwned;
125 init2(input, status);
126 }
127
128
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)129 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
130 uint32_t flags, UErrorCode &status) {
131 init(status);
132 if (U_FAILURE(status)) {
133 return;
134 }
135 UParseError pe;
136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
137 if (U_FAILURE(status)) {
138 return;
139 }
140 fPattern = fPatternOwned;
141 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
143
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)144 RegexMatcher::RegexMatcher(UText *regexp,
145 uint32_t flags, UErrorCode &status) {
146 init(status);
147 if (U_FAILURE(status)) {
148 return;
149 }
150 UParseError pe;
151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
152 if (U_FAILURE(status)) {
153 return;
154 }
155
156 fPattern = fPatternOwned;
157 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
159
160
161
162
~RegexMatcher()163 RegexMatcher::~RegexMatcher() {
164 delete fStack;
165 if (fData != fSmallData) {
166 uprv_free(fData);
167 fData = NULL;
168 }
169 if (fPatternOwned) {
170 delete fPatternOwned;
171 fPatternOwned = NULL;
172 fPattern = NULL;
173 }
174
175 if (fInput) {
176 delete fInput;
177 }
178 if (fInputText) {
179 utext_close(fInputText);
180 }
181 if (fAltInputText) {
182 utext_close(fAltInputText);
183 }
184
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr;
187 #endif
188 }
189
190 //
191 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor
195 // to run safely.
196 //
init(UErrorCode & status)197 void RegexMatcher::init(UErrorCode &status) {
198 fPattern = NULL;
199 fPatternOwned = NULL;
200 fFrameSize = 0;
201 fRegionStart = 0;
202 fRegionLimit = 0;
203 fAnchorStart = 0;
204 fAnchorLimit = 0;
205 fLookStart = 0;
206 fLookLimit = 0;
207 fActiveStart = 0;
208 fActiveLimit = 0;
209 fTransparentBounds = FALSE;
210 fAnchoringBounds = TRUE;
211 fMatch = FALSE;
212 fMatchStart = 0;
213 fMatchEnd = 0;
214 fLastMatchEnd = -1;
215 fAppendPosition = 0;
216 fHitEnd = FALSE;
217 fRequireEnd = FALSE;
218 fStack = NULL;
219 fFrame = NULL;
220 fTimeLimit = 0;
221 fTime = 0;
222 fTickCounter = 0;
223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
224 fCallbackFn = NULL;
225 fCallbackContext = NULL;
226 fFindProgressCallbackFn = NULL;
227 fFindProgressCallbackContext = NULL;
228 fTraceDebug = FALSE;
229 fDeferredStatus = status;
230 fData = fSmallData;
231 fWordBreakItr = NULL;
232
233 fStack = new UVector64(status);
234 fInputText = NULL;
235 fAltInputText = NULL;
236 fInput = NULL;
237 fInputLength = 0;
238 fInputUniStrMaybeMutable = FALSE;
239
240 if (U_FAILURE(status)) {
241 fDeferredStatus = status;
242 }
243 }
244
245 //
246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available.
248 //
init2(UText * input,UErrorCode & status)249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250 if (U_FAILURE(status)) {
251 fDeferredStatus = status;
252 return;
253 }
254
255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257 if (fData == NULL) {
258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259 return;
260 }
261 }
262
263 reset(input);
264 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
265 if (U_FAILURE(status)) {
266 fDeferredStatus = status;
267 return;
268 }
269 }
270
271
272 static const UChar BACKSLASH = 0x5c;
273 static const UChar DOLLARSIGN = 0x24;
274 //--------------------------------------------------------------------------------
275 //
276 // appendReplacement
277 //
278 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280 const UnicodeString &replacement,
281 UErrorCode &status) {
282 UText replacementText = UTEXT_INITIALIZER;
283
284 utext_openConstUnicodeString(&replacementText, &replacement, &status);
285 if (U_SUCCESS(status)) {
286 UText resultText = UTEXT_INITIALIZER;
287 utext_openUnicodeString(&resultText, &dest, &status);
288
289 if (U_SUCCESS(status)) {
290 appendReplacement(&resultText, &replacementText, status);
291 utext_close(&resultText);
292 }
293 utext_close(&replacementText);
294 }
295
296 return *this;
297 }
298
299 //
300 // appendReplacement, UText mode
301 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303 UText *replacement,
304 UErrorCode &status) {
305 if (U_FAILURE(status)) {
306 return *this;
307 }
308 if (U_FAILURE(fDeferredStatus)) {
309 status = fDeferredStatus;
310 return *this;
311 }
312 if (fMatch == FALSE) {
313 status = U_REGEX_INVALID_STATE;
314 return *this;
315 }
316
317 // Copy input string from the end of previous match to start of current match
318 int64_t destLen = utext_nativeLength(dest);
319 if (fMatchStart > fAppendPosition) {
320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322 (int32_t)(fMatchStart-fAppendPosition), &status);
323 } else {
324 int32_t len16;
325 if (UTEXT_USES_U16(fInputText)) {
326 len16 = (int32_t)(fMatchStart-fAppendPosition);
327 } else {
328 UErrorCode lengthStatus = U_ZERO_ERROR;
329 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330 }
331 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332 if (inputChars == NULL) {
333 status = U_MEMORY_ALLOCATION_ERROR;
334 return *this;
335 }
336 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338 uprv_free(inputChars);
339 }
340 }
341 fAppendPosition = fMatchEnd;
342
343
344 // scan the replacement text, looking for substitutions ($n) and \escapes.
345 // TODO: optimize this loop by efficiently scanning for '$' or '\',
346 // move entire ranges not containing substitutions.
347 UTEXT_SETNATIVEINDEX(replacement, 0);
348 UChar32 c = UTEXT_NEXT32(replacement);
349 while (c != U_SENTINEL) {
350 if (c == BACKSLASH) {
351 // Backslash Escape. Copy the following char out without further checks.
352 // Note: Surrogate pairs don't need any special handling
353 // The second half wont be a '$' or a '\', and
354 // will move to the dest normally on the next
355 // loop iteration.
356 c = UTEXT_CURRENT32(replacement);
357 if (c == U_SENTINEL) {
358 break;
359 }
360
361 if (c==0x55/*U*/ || c==0x75/*u*/) {
362 // We have a \udddd or \Udddddddd escape sequence.
363 int32_t offset = 0;
364 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
365 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
366 if (escapedChar != (UChar32)0xFFFFFFFF) {
367 if (U_IS_BMP(escapedChar)) {
368 UChar c16 = (UChar)escapedChar;
369 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
370 } else {
371 UChar surrogate[2];
372 surrogate[0] = U16_LEAD(escapedChar);
373 surrogate[1] = U16_TRAIL(escapedChar);
374 if (U_SUCCESS(status)) {
375 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
376 }
377 }
378 // TODO: Report errors for mal-formed \u escapes?
379 // As this is, the original sequence is output, which may be OK.
380 if (context.lastOffset == offset) {
381 UTEXT_PREVIOUS32(replacement);
382 } else if (context.lastOffset != offset-1) {
383 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384 }
385 }
386 } else {
387 UTEXT_NEXT32(replacement);
388 // Plain backslash escape. Just put out the escaped character.
389 if (U_IS_BMP(c)) {
390 UChar c16 = (UChar)c;
391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
392 } else {
393 UChar surrogate[2];
394 surrogate[0] = U16_LEAD(c);
395 surrogate[1] = U16_TRAIL(c);
396 if (U_SUCCESS(status)) {
397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398 }
399 }
400 }
401 } else if (c != DOLLARSIGN) {
402 // Normal char, not a $. Copy it out without further checks.
403 if (U_IS_BMP(c)) {
404 UChar c16 = (UChar)c;
405 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
406 } else {
407 UChar surrogate[2];
408 surrogate[0] = U16_LEAD(c);
409 surrogate[1] = U16_TRAIL(c);
410 if (U_SUCCESS(status)) {
411 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412 }
413 }
414 } else {
415 // We've got a $. Pick up a capture group number if one follows.
416 // Consume at most the number of digits necessary for the largest capture
417 // number that is valid for this pattern.
418
419 int32_t numDigits = 0;
420 int32_t groupNum = 0;
421 UChar32 digitC;
422 for (;;) {
423 digitC = UTEXT_CURRENT32(replacement);
424 if (digitC == U_SENTINEL) {
425 break;
426 }
427 if (u_isdigit(digitC) == FALSE) {
428 break;
429 }
430 UTEXT_NEXT32(replacement);
431 groupNum=groupNum*10 + u_charDigitValue(digitC);
432 numDigits++;
433 if (numDigits >= fPattern->fMaxCaptureDigits) {
434 break;
435 }
436 }
437
438
439 if (numDigits == 0) {
440 // The $ didn't introduce a group number at all.
441 // Treat it as just part of the substitution text.
442 UChar c16 = DOLLARSIGN;
443 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
444 } else {
445 // Finally, append the capture group data to the destination.
446 destLen += appendGroup(groupNum, dest, status);
447 if (U_FAILURE(status)) {
448 // Can fail if group number is out of range.
449 break;
450 }
451 }
452 }
453
454 if (U_FAILURE(status)) {
455 break;
456 } else {
457 c = UTEXT_NEXT32(replacement);
458 }
459 }
460
461 return *this;
462 }
463
464
465
466 //--------------------------------------------------------------------------------
467 //
468 // appendTail Intended to be used in conjunction with appendReplacement()
469 // To the destination string, append everything following
470 // the last match position from the input string.
471 //
472 // Note: Match ranges do not affect appendTail or appendReplacement
473 //
474 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
476 UErrorCode status = U_ZERO_ERROR;
477 UText resultText = UTEXT_INITIALIZER;
478 utext_openUnicodeString(&resultText, &dest, &status);
479
480 if (U_SUCCESS(status)) {
481 appendTail(&resultText, status);
482 utext_close(&resultText);
483 }
484
485 return dest;
486 }
487
488 //
489 // appendTail, UText mode
490 //
appendTail(UText * dest,UErrorCode & status)491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
492 UBool bailOut = FALSE;
493 if (U_FAILURE(status)) {
494 bailOut = TRUE;
495 }
496 if (U_FAILURE(fDeferredStatus)) {
497 status = fDeferredStatus;
498 bailOut = TRUE;
499 }
500
501 if (bailOut) {
502 // dest must not be NULL
503 if (dest) {
504 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
505 return dest;
506 }
507 }
508
509 if (fInputLength > fAppendPosition) {
510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
511 int64_t destLen = utext_nativeLength(dest);
512 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
513 (int32_t)(fInputLength-fAppendPosition), &status);
514 } else {
515 int32_t len16;
516 if (UTEXT_USES_U16(fInputText)) {
517 len16 = (int32_t)(fInputLength-fAppendPosition);
518 } else {
519 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
520 status = U_ZERO_ERROR; // buffer overflow
521 }
522
523 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
524 if (inputChars == NULL) {
525 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
526 } else {
527 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
528 int64_t destLen = utext_nativeLength(dest);
529 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
530 uprv_free(inputChars);
531 }
532 }
533 }
534 return dest;
535 }
536
537
538
539 //--------------------------------------------------------------------------------
540 //
541 // end
542 //
543 //--------------------------------------------------------------------------------
end(UErrorCode & err) const544 int32_t RegexMatcher::end(UErrorCode &err) const {
545 return end(0, err);
546 }
547
end64(UErrorCode & err) const548 int64_t RegexMatcher::end64(UErrorCode &err) const {
549 return end64(0, err);
550 }
551
end64(int32_t group,UErrorCode & err) const552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
553 if (U_FAILURE(err)) {
554 return -1;
555 }
556 if (fMatch == FALSE) {
557 err = U_REGEX_INVALID_STATE;
558 return -1;
559 }
560 if (group < 0 || group > fPattern->fGroupMap->size()) {
561 err = U_INDEX_OUTOFBOUNDS_ERROR;
562 return -1;
563 }
564 int64_t e = -1;
565 if (group == 0) {
566 e = fMatchEnd;
567 } else {
568 // Get the position within the stack frame of the variables for
569 // this capture group.
570 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
571 U_ASSERT(groupOffset < fPattern->fFrameSize);
572 U_ASSERT(groupOffset >= 0);
573 e = fFrame->fExtra[groupOffset + 1];
574 }
575
576 return e;
577 }
578
end(int32_t group,UErrorCode & err) const579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
580 return (int32_t)end64(group, err);
581 }
582
583
584 //--------------------------------------------------------------------------------
585 //
586 // find()
587 //
588 //--------------------------------------------------------------------------------
find()589 UBool RegexMatcher::find() {
590 // Start at the position of the last match end. (Will be zero if the
591 // matcher has been reset.)
592 //
593 if (U_FAILURE(fDeferredStatus)) {
594 return FALSE;
595 }
596
597 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
598 return findUsingChunk();
599 }
600
601 int64_t startPos = fMatchEnd;
602 if (startPos==0) {
603 startPos = fActiveStart;
604 }
605
606 if (fMatch) {
607 // Save the position of any previous successful match.
608 fLastMatchEnd = fMatchEnd;
609
610 if (fMatchStart == fMatchEnd) {
611 // Previous match had zero length. Move start position up one position
612 // to avoid sending find() into a loop on zero-length matches.
613 if (startPos >= fActiveLimit) {
614 fMatch = FALSE;
615 fHitEnd = TRUE;
616 return FALSE;
617 }
618 UTEXT_SETNATIVEINDEX(fInputText, startPos);
619 UTEXT_NEXT32(fInputText);
620 startPos = UTEXT_GETNATIVEINDEX(fInputText);
621 }
622 } else {
623 if (fLastMatchEnd >= 0) {
624 // A previous find() failed to match. Don't try again.
625 // (without this test, a pattern with a zero-length match
626 // could match again at the end of an input string.)
627 fHitEnd = TRUE;
628 return FALSE;
629 }
630 }
631
632
633 // Compute the position in the input string beyond which a match can not begin, because
634 // the minimum length match would extend past the end of the input.
635 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
636 // Be aware of possible overflows if making changes here.
637 int64_t testStartLimit;
638 if (UTEXT_USES_U16(fInputText)) {
639 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
640 if (startPos > testStartLimit) {
641 fMatch = FALSE;
642 fHitEnd = TRUE;
643 return FALSE;
644 }
645 } else {
646 // For now, let the matcher discover that it can't match on its own
647 // We don't know how long the match len is in native characters
648 testStartLimit = fActiveLimit;
649 }
650
651 UChar32 c;
652 U_ASSERT(startPos >= 0);
653
654 switch (fPattern->fStartType) {
655 case START_NO_INFO:
656 // No optimization was found.
657 // Try a match at each input position.
658 for (;;) {
659 MatchAt(startPos, FALSE, fDeferredStatus);
660 if (U_FAILURE(fDeferredStatus)) {
661 return FALSE;
662 }
663 if (fMatch) {
664 return TRUE;
665 }
666 if (startPos >= testStartLimit) {
667 fHitEnd = TRUE;
668 return FALSE;
669 }
670 UTEXT_SETNATIVEINDEX(fInputText, startPos);
671 UTEXT_NEXT32(fInputText);
672 startPos = UTEXT_GETNATIVEINDEX(fInputText);
673 // Note that it's perfectly OK for a pattern to have a zero-length
674 // match at the end of a string, so we must make sure that the loop
675 // runs with startPos == testStartLimit the last time through.
676 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
677 return FALSE;
678 }
679 U_ASSERT(FALSE);
680
681 case START_START:
682 // Matches are only possible at the start of the input string
683 // (pattern begins with ^ or \A)
684 if (startPos > fActiveStart) {
685 fMatch = FALSE;
686 return FALSE;
687 }
688 MatchAt(startPos, FALSE, fDeferredStatus);
689 if (U_FAILURE(fDeferredStatus)) {
690 return FALSE;
691 }
692 return fMatch;
693
694
695 case START_SET:
696 {
697 // Match may start on any char from a pre-computed set.
698 U_ASSERT(fPattern->fMinMatchLen > 0);
699 int64_t pos;
700 UTEXT_SETNATIVEINDEX(fInputText, startPos);
701 for (;;) {
702 c = UTEXT_NEXT32(fInputText);
703 pos = UTEXT_GETNATIVEINDEX(fInputText);
704 // c will be -1 (U_SENTINEL) at end of text, in which case we
705 // skip this next block (so we don't have a negative array index)
706 // and handle end of text in the following block.
707 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
708 (c>=256 && fPattern->fInitialChars->contains(c)))) {
709 MatchAt(startPos, FALSE, fDeferredStatus);
710 if (U_FAILURE(fDeferredStatus)) {
711 return FALSE;
712 }
713 if (fMatch) {
714 return TRUE;
715 }
716 UTEXT_SETNATIVEINDEX(fInputText, pos);
717 }
718 if (startPos >= testStartLimit) {
719 fMatch = FALSE;
720 fHitEnd = TRUE;
721 return FALSE;
722 }
723 startPos = pos;
724 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
725 return FALSE;
726 }
727 }
728 U_ASSERT(FALSE);
729
730 case START_STRING:
731 case START_CHAR:
732 {
733 // Match starts on exactly one char.
734 U_ASSERT(fPattern->fMinMatchLen > 0);
735 UChar32 theChar = fPattern->fInitialChar;
736 int64_t pos;
737 UTEXT_SETNATIVEINDEX(fInputText, startPos);
738 for (;;) {
739 c = UTEXT_NEXT32(fInputText);
740 pos = UTEXT_GETNATIVEINDEX(fInputText);
741 if (c == theChar) {
742 MatchAt(startPos, FALSE, fDeferredStatus);
743 if (U_FAILURE(fDeferredStatus)) {
744 return FALSE;
745 }
746 if (fMatch) {
747 return TRUE;
748 }
749 UTEXT_SETNATIVEINDEX(fInputText, pos);
750 }
751 if (startPos >= testStartLimit) {
752 fMatch = FALSE;
753 fHitEnd = TRUE;
754 return FALSE;
755 }
756 startPos = pos;
757 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
758 return FALSE;
759 }
760 }
761 U_ASSERT(FALSE);
762
763 case START_LINE:
764 {
765 UChar32 c;
766 if (startPos == fAnchorStart) {
767 MatchAt(startPos, FALSE, fDeferredStatus);
768 if (U_FAILURE(fDeferredStatus)) {
769 return FALSE;
770 }
771 if (fMatch) {
772 return TRUE;
773 }
774 UTEXT_SETNATIVEINDEX(fInputText, startPos);
775 c = UTEXT_NEXT32(fInputText);
776 startPos = UTEXT_GETNATIVEINDEX(fInputText);
777 } else {
778 UTEXT_SETNATIVEINDEX(fInputText, startPos);
779 c = UTEXT_PREVIOUS32(fInputText);
780 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781 }
782
783 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
784 for (;;) {
785 if (c == 0x0a) {
786 MatchAt(startPos, FALSE, fDeferredStatus);
787 if (U_FAILURE(fDeferredStatus)) {
788 return FALSE;
789 }
790 if (fMatch) {
791 return TRUE;
792 }
793 UTEXT_SETNATIVEINDEX(fInputText, startPos);
794 }
795 if (startPos >= testStartLimit) {
796 fMatch = FALSE;
797 fHitEnd = TRUE;
798 return FALSE;
799 }
800 c = UTEXT_NEXT32(fInputText);
801 startPos = UTEXT_GETNATIVEINDEX(fInputText);
802 // Note that it's perfectly OK for a pattern to have a zero-length
803 // match at the end of a string, so we must make sure that the loop
804 // runs with startPos == testStartLimit the last time through.
805 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
806 return FALSE;
807 }
808 } else {
809 for (;;) {
810 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
811 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
812 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
813 UTEXT_NEXT32(fInputText);
814 startPos = UTEXT_GETNATIVEINDEX(fInputText);
815 }
816 MatchAt(startPos, FALSE, fDeferredStatus);
817 if (U_FAILURE(fDeferredStatus)) {
818 return FALSE;
819 }
820 if (fMatch) {
821 return TRUE;
822 }
823 UTEXT_SETNATIVEINDEX(fInputText, startPos);
824 }
825 if (startPos >= testStartLimit) {
826 fMatch = FALSE;
827 fHitEnd = TRUE;
828 return FALSE;
829 }
830 c = UTEXT_NEXT32(fInputText);
831 startPos = UTEXT_GETNATIVEINDEX(fInputText);
832 // Note that it's perfectly OK for a pattern to have a zero-length
833 // match at the end of a string, so we must make sure that the loop
834 // runs with startPos == testStartLimit the last time through.
835 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
836 return FALSE;
837 }
838 }
839 }
840
841 default:
842 U_ASSERT(FALSE);
843 }
844
845 U_ASSERT(FALSE);
846 return FALSE;
847 }
848
849
850
find(int64_t start,UErrorCode & status)851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
852 if (U_FAILURE(status)) {
853 return FALSE;
854 }
855 if (U_FAILURE(fDeferredStatus)) {
856 status = fDeferredStatus;
857 return FALSE;
858 }
859 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
860 // This will reset the region to be the full input length.
861 if (start < 0) {
862 status = U_INDEX_OUTOFBOUNDS_ERROR;
863 return FALSE;
864 }
865
866 int64_t nativeStart = start;
867 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
868 status = U_INDEX_OUTOFBOUNDS_ERROR;
869 return FALSE;
870 }
871 fMatchEnd = nativeStart;
872 return find();
873 }
874
875
876 //--------------------------------------------------------------------------------
877 //
878 // findUsingChunk() -- like find(), but with the advance knowledge that the
879 // entire string is available in the UText's chunk buffer.
880 //
881 //--------------------------------------------------------------------------------
findUsingChunk()882 UBool RegexMatcher::findUsingChunk() {
883 // Start at the position of the last match end. (Will be zero if the
884 // matcher has been reset.
885 //
886
887 int32_t startPos = (int32_t)fMatchEnd;
888 if (startPos==0) {
889 startPos = (int32_t)fActiveStart;
890 }
891
892 const UChar *inputBuf = fInputText->chunkContents;
893
894 if (fMatch) {
895 // Save the position of any previous successful match.
896 fLastMatchEnd = fMatchEnd;
897
898 if (fMatchStart == fMatchEnd) {
899 // Previous match had zero length. Move start position up one position
900 // to avoid sending find() into a loop on zero-length matches.
901 if (startPos >= fActiveLimit) {
902 fMatch = FALSE;
903 fHitEnd = TRUE;
904 return FALSE;
905 }
906 U16_FWD_1(inputBuf, startPos, fInputLength);
907 }
908 } else {
909 if (fLastMatchEnd >= 0) {
910 // A previous find() failed to match. Don't try again.
911 // (without this test, a pattern with a zero-length match
912 // could match again at the end of an input string.)
913 fHitEnd = TRUE;
914 return FALSE;
915 }
916 }
917
918
919 // Compute the position in the input string beyond which a match can not begin, because
920 // the minimum length match would extend past the end of the input.
921 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
922 // Be aware of possible overflows if making changes here.
923 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
924 if (startPos > testLen) {
925 fMatch = FALSE;
926 fHitEnd = TRUE;
927 return FALSE;
928 }
929
930 UChar32 c;
931 U_ASSERT(startPos >= 0);
932
933 switch (fPattern->fStartType) {
934 case START_NO_INFO:
935 // No optimization was found.
936 // Try a match at each input position.
937 for (;;) {
938 MatchChunkAt(startPos, FALSE, fDeferredStatus);
939 if (U_FAILURE(fDeferredStatus)) {
940 return FALSE;
941 }
942 if (fMatch) {
943 return TRUE;
944 }
945 if (startPos >= testLen) {
946 fHitEnd = TRUE;
947 return FALSE;
948 }
949 U16_FWD_1(inputBuf, startPos, fActiveLimit);
950 // Note that it's perfectly OK for a pattern to have a zero-length
951 // match at the end of a string, so we must make sure that the loop
952 // runs with startPos == testLen the last time through.
953 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
954 return FALSE;
955 }
956 U_ASSERT(FALSE);
957
958 case START_START:
959 // Matches are only possible at the start of the input string
960 // (pattern begins with ^ or \A)
961 if (startPos > fActiveStart) {
962 fMatch = FALSE;
963 return FALSE;
964 }
965 MatchChunkAt(startPos, FALSE, fDeferredStatus);
966 if (U_FAILURE(fDeferredStatus)) {
967 return FALSE;
968 }
969 return fMatch;
970
971
972 case START_SET:
973 {
974 // Match may start on any char from a pre-computed set.
975 U_ASSERT(fPattern->fMinMatchLen > 0);
976 for (;;) {
977 int32_t pos = startPos;
978 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
979 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
980 (c>=256 && fPattern->fInitialChars->contains(c))) {
981 MatchChunkAt(pos, FALSE, fDeferredStatus);
982 if (U_FAILURE(fDeferredStatus)) {
983 return FALSE;
984 }
985 if (fMatch) {
986 return TRUE;
987 }
988 }
989 if (pos >= testLen) {
990 fMatch = FALSE;
991 fHitEnd = TRUE;
992 return FALSE;
993 }
994 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
995 return FALSE;
996 }
997 }
998 U_ASSERT(FALSE);
999
1000 case START_STRING:
1001 case START_CHAR:
1002 {
1003 // Match starts on exactly one char.
1004 U_ASSERT(fPattern->fMinMatchLen > 0);
1005 UChar32 theChar = fPattern->fInitialChar;
1006 for (;;) {
1007 int32_t pos = startPos;
1008 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1009 if (c == theChar) {
1010 MatchChunkAt(pos, FALSE, fDeferredStatus);
1011 if (U_FAILURE(fDeferredStatus)) {
1012 return FALSE;
1013 }
1014 if (fMatch) {
1015 return TRUE;
1016 }
1017 }
1018 if (pos >= testLen) {
1019 fMatch = FALSE;
1020 fHitEnd = TRUE;
1021 return FALSE;
1022 }
1023 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1024 return FALSE;
1025 }
1026 }
1027 U_ASSERT(FALSE);
1028
1029 case START_LINE:
1030 {
1031 UChar32 c;
1032 if (startPos == fAnchorStart) {
1033 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1034 if (U_FAILURE(fDeferredStatus)) {
1035 return FALSE;
1036 }
1037 if (fMatch) {
1038 return TRUE;
1039 }
1040 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1041 }
1042
1043 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1044 for (;;) {
1045 c = inputBuf[startPos-1];
1046 if (c == 0x0a) {
1047 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1048 if (U_FAILURE(fDeferredStatus)) {
1049 return FALSE;
1050 }
1051 if (fMatch) {
1052 return TRUE;
1053 }
1054 }
1055 if (startPos >= testLen) {
1056 fMatch = FALSE;
1057 fHitEnd = TRUE;
1058 return FALSE;
1059 }
1060 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1061 // Note that it's perfectly OK for a pattern to have a zero-length
1062 // match at the end of a string, so we must make sure that the loop
1063 // runs with startPos == testLen the last time through.
1064 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1065 return FALSE;
1066 }
1067 } else {
1068 for (;;) {
1069 c = inputBuf[startPos-1];
1070 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1071 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1072 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1073 startPos++;
1074 }
1075 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1076 if (U_FAILURE(fDeferredStatus)) {
1077 return FALSE;
1078 }
1079 if (fMatch) {
1080 return TRUE;
1081 }
1082 }
1083 if (startPos >= testLen) {
1084 fMatch = FALSE;
1085 fHitEnd = TRUE;
1086 return FALSE;
1087 }
1088 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089 // Note that it's perfectly OK for a pattern to have a zero-length
1090 // match at the end of a string, so we must make sure that the loop
1091 // runs with startPos == testLen the last time through.
1092 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1093 return FALSE;
1094 }
1095 }
1096 }
1097
1098 default:
1099 U_ASSERT(FALSE);
1100 }
1101
1102 U_ASSERT(FALSE);
1103 return FALSE;
1104 }
1105
1106
1107
1108 //--------------------------------------------------------------------------------
1109 //
1110 // group()
1111 //
1112 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1113 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1114 return group(0, status);
1115 }
1116
1117 // Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1119 return group(0, dest, group_len, status);
1120 }
1121
1122 // Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1124 group_len = 0;
1125 UBool bailOut = FALSE;
1126 if (U_FAILURE(status)) {
1127 return dest;
1128 }
1129 if (U_FAILURE(fDeferredStatus)) {
1130 status = fDeferredStatus;
1131 bailOut = TRUE;
1132 }
1133 if (fMatch == FALSE) {
1134 status = U_REGEX_INVALID_STATE;
1135 bailOut = TRUE;
1136 }
1137 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1138 status = U_INDEX_OUTOFBOUNDS_ERROR;
1139 bailOut = TRUE;
1140 }
1141
1142 if (bailOut) {
1143 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1144 }
1145
1146 int64_t s, e;
1147 if (groupNum == 0) {
1148 s = fMatchStart;
1149 e = fMatchEnd;
1150 } else {
1151 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1152 U_ASSERT(groupOffset < fPattern->fFrameSize);
1153 U_ASSERT(groupOffset >= 0);
1154 s = fFrame->fExtra[groupOffset];
1155 e = fFrame->fExtra[groupOffset+1];
1156 }
1157
1158 if (s < 0) {
1159 // A capture group wasn't part of the match
1160 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1161 }
1162 U_ASSERT(s <= e);
1163 group_len = e - s;
1164
1165 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1166 if (dest)
1167 UTEXT_SETNATIVEINDEX(dest, s);
1168 return dest;
1169 }
1170
group(int32_t groupNum,UErrorCode & status) const1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1172 UnicodeString result;
1173 if (U_FAILURE(status)) {
1174 return result;
1175 }
1176 UText resultText = UTEXT_INITIALIZER;
1177 utext_openUnicodeString(&resultText, &result, &status);
1178 group(groupNum, &resultText, status);
1179 utext_close(&resultText);
1180 return result;
1181 }
1182
1183
1184 // Return deep (mutable) clone
1185 // Technology Preview (as an API), but note that the UnicodeString API is implemented
1186 // using this function.
group(int32_t groupNum,UText * dest,UErrorCode & status) const1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1188 UBool bailOut = FALSE;
1189 if (U_FAILURE(status)) {
1190 return dest;
1191 }
1192 if (U_FAILURE(fDeferredStatus)) {
1193 status = fDeferredStatus;
1194 bailOut = TRUE;
1195 }
1196
1197 if (fMatch == FALSE) {
1198 status = U_REGEX_INVALID_STATE;
1199 bailOut = TRUE;
1200 }
1201 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202 status = U_INDEX_OUTOFBOUNDS_ERROR;
1203 bailOut = TRUE;
1204 }
1205
1206 if (bailOut) {
1207 if (dest) {
1208 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1209 return dest;
1210 } else {
1211 return utext_openUChars(NULL, NULL, 0, &status);
1212 }
1213 }
1214
1215 int64_t s, e;
1216 if (groupNum == 0) {
1217 s = fMatchStart;
1218 e = fMatchEnd;
1219 } else {
1220 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1221 U_ASSERT(groupOffset < fPattern->fFrameSize);
1222 U_ASSERT(groupOffset >= 0);
1223 s = fFrame->fExtra[groupOffset];
1224 e = fFrame->fExtra[groupOffset+1];
1225 }
1226
1227 if (s < 0) {
1228 // A capture group wasn't part of the match
1229 if (dest) {
1230 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1231 return dest;
1232 } else {
1233 return utext_openUChars(NULL, NULL, 0, &status);
1234 }
1235 }
1236 U_ASSERT(s <= e);
1237
1238 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1239 U_ASSERT(e <= fInputLength);
1240 if (dest) {
1241 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1242 } else {
1243 UText groupText = UTEXT_INITIALIZER;
1244 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1245 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1246 utext_close(&groupText);
1247 }
1248 } else {
1249 int32_t len16;
1250 if (UTEXT_USES_U16(fInputText)) {
1251 len16 = (int32_t)(e-s);
1252 } else {
1253 UErrorCode lengthStatus = U_ZERO_ERROR;
1254 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1255 }
1256 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1257 if (groupChars == NULL) {
1258 status = U_MEMORY_ALLOCATION_ERROR;
1259 return dest;
1260 }
1261 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1262
1263 if (dest) {
1264 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1265 } else {
1266 UText groupText = UTEXT_INITIALIZER;
1267 utext_openUChars(&groupText, groupChars, len16, &status);
1268 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1269 utext_close(&groupText);
1270 }
1271
1272 uprv_free(groupChars);
1273 }
1274 return dest;
1275 }
1276
1277 //--------------------------------------------------------------------------------
1278 //
1279 // appendGroup() -- currently internal only, appends a group to a UText rather
1280 // than replacing its contents
1281 //
1282 //--------------------------------------------------------------------------------
1283
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1285 if (U_FAILURE(status)) {
1286 return 0;
1287 }
1288 if (U_FAILURE(fDeferredStatus)) {
1289 status = fDeferredStatus;
1290 return 0;
1291 }
1292 int64_t destLen = utext_nativeLength(dest);
1293
1294 if (fMatch == FALSE) {
1295 status = U_REGEX_INVALID_STATE;
1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297 }
1298 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1299 status = U_INDEX_OUTOFBOUNDS_ERROR;
1300 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1301 }
1302
1303 int64_t s, e;
1304 if (groupNum == 0) {
1305 s = fMatchStart;
1306 e = fMatchEnd;
1307 } else {
1308 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1309 U_ASSERT(groupOffset < fPattern->fFrameSize);
1310 U_ASSERT(groupOffset >= 0);
1311 s = fFrame->fExtra[groupOffset];
1312 e = fFrame->fExtra[groupOffset+1];
1313 }
1314
1315 if (s < 0) {
1316 // A capture group wasn't part of the match
1317 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1318 }
1319 U_ASSERT(s <= e);
1320
1321 int64_t deltaLen;
1322 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1323 U_ASSERT(e <= fInputLength);
1324 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1325 } else {
1326 int32_t len16;
1327 if (UTEXT_USES_U16(fInputText)) {
1328 len16 = (int32_t)(e-s);
1329 } else {
1330 UErrorCode lengthStatus = U_ZERO_ERROR;
1331 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1332 }
1333 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1334 if (groupChars == NULL) {
1335 status = U_MEMORY_ALLOCATION_ERROR;
1336 return 0;
1337 }
1338 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1339
1340 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1341 uprv_free(groupChars);
1342 }
1343 return deltaLen;
1344 }
1345
1346
1347
1348 //--------------------------------------------------------------------------------
1349 //
1350 // groupCount()
1351 //
1352 //--------------------------------------------------------------------------------
groupCount() const1353 int32_t RegexMatcher::groupCount() const {
1354 return fPattern->fGroupMap->size();
1355 }
1356
1357
1358
1359 //--------------------------------------------------------------------------------
1360 //
1361 // hasAnchoringBounds()
1362 //
1363 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1364 UBool RegexMatcher::hasAnchoringBounds() const {
1365 return fAnchoringBounds;
1366 }
1367
1368
1369 //--------------------------------------------------------------------------------
1370 //
1371 // hasTransparentBounds()
1372 //
1373 //--------------------------------------------------------------------------------
hasTransparentBounds() const1374 UBool RegexMatcher::hasTransparentBounds() const {
1375 return fTransparentBounds;
1376 }
1377
1378
1379
1380 //--------------------------------------------------------------------------------
1381 //
1382 // hitEnd()
1383 //
1384 //--------------------------------------------------------------------------------
hitEnd() const1385 UBool RegexMatcher::hitEnd() const {
1386 return fHitEnd;
1387 }
1388
1389
1390 //--------------------------------------------------------------------------------
1391 //
1392 // input()
1393 //
1394 //--------------------------------------------------------------------------------
input() const1395 const UnicodeString &RegexMatcher::input() const {
1396 if (!fInput) {
1397 UErrorCode status = U_ZERO_ERROR;
1398 int32_t len16;
1399 if (UTEXT_USES_U16(fInputText)) {
1400 len16 = (int32_t)fInputLength;
1401 } else {
1402 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1403 status = U_ZERO_ERROR; // overflow, length status
1404 }
1405 UnicodeString *result = new UnicodeString(len16, 0, 0);
1406
1407 UChar *inputChars = result->getBuffer(len16);
1408 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1409 result->releaseBuffer(len16);
1410
1411 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1412 }
1413
1414 return *fInput;
1415 }
1416
1417 //--------------------------------------------------------------------------------
1418 //
1419 // inputText()
1420 //
1421 //--------------------------------------------------------------------------------
inputText() const1422 UText *RegexMatcher::inputText() const {
1423 return fInputText;
1424 }
1425
1426
1427 //--------------------------------------------------------------------------------
1428 //
1429 // getInput() -- like inputText(), but makes a clone or copies into another UText
1430 //
1431 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1433 UBool bailOut = FALSE;
1434 if (U_FAILURE(status)) {
1435 return dest;
1436 }
1437 if (U_FAILURE(fDeferredStatus)) {
1438 status = fDeferredStatus;
1439 bailOut = TRUE;
1440 }
1441
1442 if (bailOut) {
1443 if (dest) {
1444 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1445 return dest;
1446 } else {
1447 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1448 }
1449 }
1450
1451 if (dest) {
1452 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1453 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1454 } else {
1455 int32_t input16Len;
1456 if (UTEXT_USES_U16(fInputText)) {
1457 input16Len = (int32_t)fInputLength;
1458 } else {
1459 UErrorCode lengthStatus = U_ZERO_ERROR;
1460 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1461 }
1462 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1463 if (inputChars == NULL) {
1464 return dest;
1465 }
1466
1467 status = U_ZERO_ERROR;
1468 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1469 status = U_ZERO_ERROR;
1470 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1471
1472 uprv_free(inputChars);
1473 }
1474 return dest;
1475 } else {
1476 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1477 }
1478 }
1479
1480
1481 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1482 static UBool compat_SyncMutableUTextContents(UText *ut) {
1483 UBool retVal = FALSE;
1484
1485 // In the following test, we're really only interested in whether the UText should switch
1486 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1487 // will still point to the correct data.
1488 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1489 UnicodeString *us=(UnicodeString *)ut->context;
1490
1491 // Update to the latest length.
1492 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493 int32_t newLength = us->length();
1494
1495 // Update the chunk description.
1496 // The buffer may have switched between stack- and heap-based.
1497 ut->chunkContents = us->getBuffer();
1498 ut->chunkLength = newLength;
1499 ut->chunkNativeLimit = newLength;
1500 ut->nativeIndexingLimit = newLength;
1501 retVal = TRUE;
1502 }
1503
1504 return retVal;
1505 }
1506
1507 //--------------------------------------------------------------------------------
1508 //
1509 // lookingAt()
1510 //
1511 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1512 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1513 if (U_FAILURE(status)) {
1514 return FALSE;
1515 }
1516 if (U_FAILURE(fDeferredStatus)) {
1517 status = fDeferredStatus;
1518 return FALSE;
1519 }
1520
1521 if (fInputUniStrMaybeMutable) {
1522 if (compat_SyncMutableUTextContents(fInputText)) {
1523 fInputLength = utext_nativeLength(fInputText);
1524 reset();
1525 }
1526 }
1527 else {
1528 resetPreserveRegion();
1529 }
1530 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1531 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1532 } else {
1533 MatchAt(fActiveStart, FALSE, status);
1534 }
1535 return fMatch;
1536 }
1537
1538
lookingAt(int64_t start,UErrorCode & status)1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1540 if (U_FAILURE(status)) {
1541 return FALSE;
1542 }
1543 if (U_FAILURE(fDeferredStatus)) {
1544 status = fDeferredStatus;
1545 return FALSE;
1546 }
1547 reset();
1548
1549 if (start < 0) {
1550 status = U_INDEX_OUTOFBOUNDS_ERROR;
1551 return FALSE;
1552 }
1553
1554 if (fInputUniStrMaybeMutable) {
1555 if (compat_SyncMutableUTextContents(fInputText)) {
1556 fInputLength = utext_nativeLength(fInputText);
1557 reset();
1558 }
1559 }
1560
1561 int64_t nativeStart;
1562 nativeStart = start;
1563 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1564 status = U_INDEX_OUTOFBOUNDS_ERROR;
1565 return FALSE;
1566 }
1567
1568 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1569 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1570 } else {
1571 MatchAt(nativeStart, FALSE, status);
1572 }
1573 return fMatch;
1574 }
1575
1576
1577
1578 //--------------------------------------------------------------------------------
1579 //
1580 // matches()
1581 //
1582 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1583 UBool RegexMatcher::matches(UErrorCode &status) {
1584 if (U_FAILURE(status)) {
1585 return FALSE;
1586 }
1587 if (U_FAILURE(fDeferredStatus)) {
1588 status = fDeferredStatus;
1589 return FALSE;
1590 }
1591
1592 if (fInputUniStrMaybeMutable) {
1593 if (compat_SyncMutableUTextContents(fInputText)) {
1594 fInputLength = utext_nativeLength(fInputText);
1595 reset();
1596 }
1597 }
1598 else {
1599 resetPreserveRegion();
1600 }
1601
1602 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1603 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1604 } else {
1605 MatchAt(fActiveStart, TRUE, status);
1606 }
1607 return fMatch;
1608 }
1609
1610
matches(int64_t start,UErrorCode & status)1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1612 if (U_FAILURE(status)) {
1613 return FALSE;
1614 }
1615 if (U_FAILURE(fDeferredStatus)) {
1616 status = fDeferredStatus;
1617 return FALSE;
1618 }
1619 reset();
1620
1621 if (start < 0) {
1622 status = U_INDEX_OUTOFBOUNDS_ERROR;
1623 return FALSE;
1624 }
1625
1626 if (fInputUniStrMaybeMutable) {
1627 if (compat_SyncMutableUTextContents(fInputText)) {
1628 fInputLength = utext_nativeLength(fInputText);
1629 reset();
1630 }
1631 }
1632
1633 int64_t nativeStart;
1634 nativeStart = start;
1635 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1636 status = U_INDEX_OUTOFBOUNDS_ERROR;
1637 return FALSE;
1638 }
1639
1640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1641 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1642 } else {
1643 MatchAt(nativeStart, TRUE, status);
1644 }
1645 return fMatch;
1646 }
1647
1648
1649
1650 //--------------------------------------------------------------------------------
1651 //
1652 // pattern
1653 //
1654 //--------------------------------------------------------------------------------
pattern() const1655 const RegexPattern &RegexMatcher::pattern() const {
1656 return *fPattern;
1657 }
1658
1659
1660
1661 //--------------------------------------------------------------------------------
1662 //
1663 // region
1664 //
1665 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1667 if (U_FAILURE(status)) {
1668 return *this;
1669 }
1670
1671 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1672 status = U_ILLEGAL_ARGUMENT_ERROR;
1673 }
1674
1675 int64_t nativeStart = regionStart;
1676 int64_t nativeLimit = regionLimit;
1677 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1678 status = U_ILLEGAL_ARGUMENT_ERROR;
1679 }
1680
1681 if (startIndex == -1)
1682 this->reset();
1683 else
1684 resetPreserveRegion();
1685
1686 fRegionStart = nativeStart;
1687 fRegionLimit = nativeLimit;
1688 fActiveStart = nativeStart;
1689 fActiveLimit = nativeLimit;
1690
1691 if (startIndex != -1) {
1692 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1693 status = U_INDEX_OUTOFBOUNDS_ERROR;
1694 }
1695 fMatchEnd = startIndex;
1696 }
1697
1698 if (!fTransparentBounds) {
1699 fLookStart = nativeStart;
1700 fLookLimit = nativeLimit;
1701 }
1702 if (fAnchoringBounds) {
1703 fAnchorStart = nativeStart;
1704 fAnchorLimit = nativeLimit;
1705 }
1706 return *this;
1707 }
1708
region(int64_t start,int64_t limit,UErrorCode & status)1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1710 return region(start, limit, -1, status);
1711 }
1712
1713 //--------------------------------------------------------------------------------
1714 //
1715 // regionEnd
1716 //
1717 //--------------------------------------------------------------------------------
regionEnd() const1718 int32_t RegexMatcher::regionEnd() const {
1719 return (int32_t)fRegionLimit;
1720 }
1721
regionEnd64() const1722 int64_t RegexMatcher::regionEnd64() const {
1723 return fRegionLimit;
1724 }
1725
1726 //--------------------------------------------------------------------------------
1727 //
1728 // regionStart
1729 //
1730 //--------------------------------------------------------------------------------
regionStart() const1731 int32_t RegexMatcher::regionStart() const {
1732 return (int32_t)fRegionStart;
1733 }
1734
regionStart64() const1735 int64_t RegexMatcher::regionStart64() const {
1736 return fRegionStart;
1737 }
1738
1739
1740 //--------------------------------------------------------------------------------
1741 //
1742 // replaceAll
1743 //
1744 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1746 UText replacementText = UTEXT_INITIALIZER;
1747 UText resultText = UTEXT_INITIALIZER;
1748 UnicodeString resultString;
1749 if (U_FAILURE(status)) {
1750 return resultString;
1751 }
1752
1753 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754 utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756 replaceAll(&replacementText, &resultText, status);
1757
1758 utext_close(&resultText);
1759 utext_close(&replacementText);
1760
1761 return resultString;
1762 }
1763
1764
1765 //
1766 // replaceAll, UText mode
1767 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1769 if (U_FAILURE(status)) {
1770 return dest;
1771 }
1772 if (U_FAILURE(fDeferredStatus)) {
1773 status = fDeferredStatus;
1774 return dest;
1775 }
1776
1777 if (dest == NULL) {
1778 UnicodeString emptyString;
1779 UText empty = UTEXT_INITIALIZER;
1780
1781 utext_openUnicodeString(&empty, &emptyString, &status);
1782 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783 utext_close(&empty);
1784 }
1785
1786 if (U_SUCCESS(status)) {
1787 reset();
1788 while (find()) {
1789 appendReplacement(dest, replacement, status);
1790 if (U_FAILURE(status)) {
1791 break;
1792 }
1793 }
1794 appendTail(dest, status);
1795 }
1796
1797 return dest;
1798 }
1799
1800
1801 //--------------------------------------------------------------------------------
1802 //
1803 // replaceFirst
1804 //
1805 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1807 UText replacementText = UTEXT_INITIALIZER;
1808 UText resultText = UTEXT_INITIALIZER;
1809 UnicodeString resultString;
1810
1811 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1812 utext_openUnicodeString(&resultText, &resultString, &status);
1813
1814 replaceFirst(&replacementText, &resultText, status);
1815
1816 utext_close(&resultText);
1817 utext_close(&replacementText);
1818
1819 return resultString;
1820 }
1821
1822 //
1823 // replaceFirst, UText mode
1824 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1826 if (U_FAILURE(status)) {
1827 return dest;
1828 }
1829 if (U_FAILURE(fDeferredStatus)) {
1830 status = fDeferredStatus;
1831 return dest;
1832 }
1833
1834 reset();
1835 if (!find()) {
1836 return getInput(dest, status);
1837 }
1838
1839 if (dest == NULL) {
1840 UnicodeString emptyString;
1841 UText empty = UTEXT_INITIALIZER;
1842
1843 utext_openUnicodeString(&empty, &emptyString, &status);
1844 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1845 utext_close(&empty);
1846 }
1847
1848 appendReplacement(dest, replacement, status);
1849 appendTail(dest, status);
1850
1851 return dest;
1852 }
1853
1854
1855 //--------------------------------------------------------------------------------
1856 //
1857 // requireEnd
1858 //
1859 //--------------------------------------------------------------------------------
requireEnd() const1860 UBool RegexMatcher::requireEnd() const {
1861 return fRequireEnd;
1862 }
1863
1864
1865 //--------------------------------------------------------------------------------
1866 //
1867 // reset
1868 //
1869 //--------------------------------------------------------------------------------
reset()1870 RegexMatcher &RegexMatcher::reset() {
1871 fRegionStart = 0;
1872 fRegionLimit = fInputLength;
1873 fActiveStart = 0;
1874 fActiveLimit = fInputLength;
1875 fAnchorStart = 0;
1876 fAnchorLimit = fInputLength;
1877 fLookStart = 0;
1878 fLookLimit = fInputLength;
1879 resetPreserveRegion();
1880 return *this;
1881 }
1882
1883
1884
resetPreserveRegion()1885 void RegexMatcher::resetPreserveRegion() {
1886 fMatchStart = 0;
1887 fMatchEnd = 0;
1888 fLastMatchEnd = -1;
1889 fAppendPosition = 0;
1890 fMatch = FALSE;
1891 fHitEnd = FALSE;
1892 fRequireEnd = FALSE;
1893 fTime = 0;
1894 fTickCounter = TIMER_INITIAL_VALUE;
1895 //resetStack(); // more expensive than it looks...
1896 }
1897
1898
reset(const UnicodeString & input)1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1900 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1901 if (fPattern->fNeedsAltInput) {
1902 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1903 }
1904 fInputLength = utext_nativeLength(fInputText);
1905
1906 reset();
1907 delete fInput;
1908 fInput = NULL;
1909
1910 // Do the following for any UnicodeString.
1911 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1912 fInputUniStrMaybeMutable = TRUE;
1913
1914 if (fWordBreakItr != NULL) {
1915 #if UCONFIG_NO_BREAK_ITERATION==0
1916 UErrorCode status = U_ZERO_ERROR;
1917 fWordBreakItr->setText(fInputText, status);
1918 #endif
1919 }
1920 return *this;
1921 }
1922
1923
reset(UText * input)1924 RegexMatcher &RegexMatcher::reset(UText *input) {
1925 if (fInputText != input) {
1926 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1927 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1928 fInputLength = utext_nativeLength(fInputText);
1929
1930 delete fInput;
1931 fInput = NULL;
1932
1933 if (fWordBreakItr != NULL) {
1934 #if UCONFIG_NO_BREAK_ITERATION==0
1935 UErrorCode status = U_ZERO_ERROR;
1936 fWordBreakItr->setText(input, status);
1937 #endif
1938 }
1939 }
1940 reset();
1941 fInputUniStrMaybeMutable = FALSE;
1942
1943 return *this;
1944 }
1945
1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1948 return *this;
1949 }*/
1950
reset(int64_t position,UErrorCode & status)1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1952 if (U_FAILURE(status)) {
1953 return *this;
1954 }
1955 reset(); // Reset also resets the region to be the entire string.
1956
1957 if (position < 0 || position > fActiveLimit) {
1958 status = U_INDEX_OUTOFBOUNDS_ERROR;
1959 return *this;
1960 }
1961 fMatchEnd = position;
1962 return *this;
1963 }
1964
1965
1966 // BEGIN android-added
1967 // Removed this function after Android upgrad to ICU4.8.
1968 //--------------------------------------------------------------------------------
1969 //
1970 // refresh
1971 //
1972 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1973 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1974 if (U_FAILURE(status)) {
1975 return *this;
1976 }
1977 if (input == NULL) {
1978 status = U_ILLEGAL_ARGUMENT_ERROR;
1979 return *this;
1980 }
1981 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1982 status = U_ILLEGAL_ARGUMENT_ERROR;
1983 return *this;
1984 }
1985 int64_t pos = utext_getNativeIndex(fInputText);
1986 // Shallow read-only clone of the new UText into the existing input UText
1987 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1988 if (U_FAILURE(status)) {
1989 return *this;
1990 }
1991 utext_setNativeIndex(fInputText, pos);
1992
1993 if (fAltInputText != NULL) {
1994 pos = utext_getNativeIndex(fAltInputText);
1995 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1996 if (U_FAILURE(status)) {
1997 return *this;
1998 }
1999 utext_setNativeIndex(fAltInputText, pos);
2000 }
2001 return *this;
2002 }
2003 // END android-added
2004
2005
2006 //--------------------------------------------------------------------------------
2007 //
2008 // setTrace
2009 //
2010 //--------------------------------------------------------------------------------
setTrace(UBool state)2011 void RegexMatcher::setTrace(UBool state) {
2012 fTraceDebug = state;
2013 }
2014
2015
2016
2017 //---------------------------------------------------------------------
2018 //
2019 // split
2020 //
2021 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2022 int32_t RegexMatcher::split(const UnicodeString &input,
2023 UnicodeString dest[],
2024 int32_t destCapacity,
2025 UErrorCode &status)
2026 {
2027 UText inputText = UTEXT_INITIALIZER;
2028 utext_openConstUnicodeString(&inputText, &input, &status);
2029 if (U_FAILURE(status)) {
2030 return 0;
2031 }
2032
2033 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2034 if (destText == NULL) {
2035 status = U_MEMORY_ALLOCATION_ERROR;
2036 return 0;
2037 }
2038 int32_t i;
2039 for (i = 0; i < destCapacity; i++) {
2040 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2041 }
2042
2043 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2044
2045 for (i = 0; i < destCapacity; i++) {
2046 utext_close(destText[i]);
2047 }
2048
2049 uprv_free(destText);
2050 utext_close(&inputText);
2051 return fieldCount;
2052 }
2053
2054 //
2055 // split, UText mode
2056 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2057 int32_t RegexMatcher::split(UText *input,
2058 UText *dest[],
2059 int32_t destCapacity,
2060 UErrorCode &status)
2061 {
2062 //
2063 // Check arguements for validity
2064 //
2065 if (U_FAILURE(status)) {
2066 return 0;
2067 };
2068
2069 if (destCapacity < 1) {
2070 status = U_ILLEGAL_ARGUMENT_ERROR;
2071 return 0;
2072 }
2073
2074 //
2075 // Reset for the input text
2076 //
2077 reset(input);
2078 int64_t nextOutputStringStart = 0;
2079 if (fActiveLimit == 0) {
2080 return 0;
2081 }
2082
2083 //
2084 // Loop through the input text, searching for the delimiter pattern
2085 //
2086 int32_t i;
2087 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2088 for (i=0; ; i++) {
2089 if (i>=destCapacity-1) {
2090 // There is one or zero output string left.
2091 // Fill the last output string with whatever is left from the input, then exit the loop.
2092 // ( i will be == destCapacity if we filled the output array while processing
2093 // capture groups of the delimiter expression, in which case we will discard the
2094 // last capture group saved in favor of the unprocessed remainder of the
2095 // input string.)
2096 i = destCapacity-1;
2097 if (fActiveLimit > nextOutputStringStart) {
2098 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2099 if (dest[i]) {
2100 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2101 input->chunkContents+nextOutputStringStart,
2102 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2103 } else {
2104 UText remainingText = UTEXT_INITIALIZER;
2105 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2106 fActiveLimit-nextOutputStringStart, &status);
2107 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2108 utext_close(&remainingText);
2109 }
2110 } else {
2111 UErrorCode lengthStatus = U_ZERO_ERROR;
2112 int32_t remaining16Length =
2113 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2114 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2115 if (remainingChars == NULL) {
2116 status = U_MEMORY_ALLOCATION_ERROR;
2117 break;
2118 }
2119
2120 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2121 if (dest[i]) {
2122 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2123 } else {
2124 UText remainingText = UTEXT_INITIALIZER;
2125 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2126 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2127 utext_close(&remainingText);
2128 }
2129
2130 uprv_free(remainingChars);
2131 }
2132 }
2133 break;
2134 }
2135 if (find()) {
2136 // We found another delimiter. Move everything from where we started looking
2137 // up until the start of the delimiter into the next output string.
2138 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2139 if (dest[i]) {
2140 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2141 input->chunkContents+nextOutputStringStart,
2142 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2143 } else {
2144 UText remainingText = UTEXT_INITIALIZER;
2145 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2146 fMatchStart-nextOutputStringStart, &status);
2147 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2148 utext_close(&remainingText);
2149 }
2150 } else {
2151 UErrorCode lengthStatus = U_ZERO_ERROR;
2152 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2153 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2154 if (remainingChars == NULL) {
2155 status = U_MEMORY_ALLOCATION_ERROR;
2156 break;
2157 }
2158 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2159 if (dest[i]) {
2160 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2161 } else {
2162 UText remainingText = UTEXT_INITIALIZER;
2163 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2164 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2165 utext_close(&remainingText);
2166 }
2167
2168 uprv_free(remainingChars);
2169 }
2170 nextOutputStringStart = fMatchEnd;
2171
2172 // If the delimiter pattern has capturing parentheses, the captured
2173 // text goes out into the next n destination strings.
2174 int32_t groupNum;
2175 UBool lastGroupWasNullUText = FALSE;
2176 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2177 if (i==destCapacity-1) {
2178 break;
2179 }
2180 i++;
2181 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
2182 dest[i] = group(groupNum, dest[i], status);
2183 }
2184
2185 if (nextOutputStringStart == fActiveLimit) {
2186 // The delimiter was at the end of the string. We're done.
2187 break;
2188 } else if (i == destCapacity-1) {
2189 // We're out of capture groups, and the rest of the string is more important
2190 if (lastGroupWasNullUText) {
2191 utext_close(dest[i]);
2192 dest[i] = NULL;
2193 }
2194 }
2195
2196 }
2197 else
2198 {
2199 // We ran off the end of the input while looking for the next delimiter.
2200 // All the remaining text goes into the current output string.
2201 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2202 if (dest[i]) {
2203 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2204 input->chunkContents+nextOutputStringStart,
2205 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2206 } else {
2207 UText remainingText = UTEXT_INITIALIZER;
2208 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2209 fActiveLimit-nextOutputStringStart, &status);
2210 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2211 utext_close(&remainingText);
2212 }
2213 } else {
2214 UErrorCode lengthStatus = U_ZERO_ERROR;
2215 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2216 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2217 if (remainingChars == NULL) {
2218 status = U_MEMORY_ALLOCATION_ERROR;
2219 break;
2220 }
2221
2222 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2223 if (dest[i]) {
2224 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2225 } else {
2226 UText remainingText = UTEXT_INITIALIZER;
2227 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2228 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2229 utext_close(&remainingText);
2230 }
2231
2232 uprv_free(remainingChars);
2233 }
2234 break;
2235 }
2236 if (U_FAILURE(status)) {
2237 break;
2238 }
2239 } // end of for loop
2240 return i+1;
2241 }
2242
2243
2244 //--------------------------------------------------------------------------------
2245 //
2246 // start
2247 //
2248 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2249 int32_t RegexMatcher::start(UErrorCode &status) const {
2250 return start(0, status);
2251 }
2252
start64(UErrorCode & status) const2253 int64_t RegexMatcher::start64(UErrorCode &status) const {
2254 return start64(0, status);
2255 }
2256
2257 //--------------------------------------------------------------------------------
2258 //
2259 // start(int32_t group, UErrorCode &status)
2260 //
2261 //--------------------------------------------------------------------------------
2262
start64(int32_t group,UErrorCode & status) const2263 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2264 if (U_FAILURE(status)) {
2265 return -1;
2266 }
2267 if (U_FAILURE(fDeferredStatus)) {
2268 status = fDeferredStatus;
2269 return -1;
2270 }
2271 if (fMatch == FALSE) {
2272 status = U_REGEX_INVALID_STATE;
2273 return -1;
2274 }
2275 if (group < 0 || group > fPattern->fGroupMap->size()) {
2276 status = U_INDEX_OUTOFBOUNDS_ERROR;
2277 return -1;
2278 }
2279 int64_t s;
2280 if (group == 0) {
2281 s = fMatchStart;
2282 } else {
2283 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2284 U_ASSERT(groupOffset < fPattern->fFrameSize);
2285 U_ASSERT(groupOffset >= 0);
2286 s = fFrame->fExtra[groupOffset];
2287 }
2288
2289 return s;
2290 }
2291
2292
start(int32_t group,UErrorCode & status) const2293 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2294 return (int32_t)start64(group, status);
2295 }
2296
2297 //--------------------------------------------------------------------------------
2298 //
2299 // useAnchoringBounds
2300 //
2301 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2302 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2303 fAnchoringBounds = b;
2304 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2305 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2306 return *this;
2307 }
2308
2309
2310 //--------------------------------------------------------------------------------
2311 //
2312 // useTransparentBounds
2313 //
2314 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2315 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2316 fTransparentBounds = b;
2317 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2318 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2319 return *this;
2320 }
2321
2322 //--------------------------------------------------------------------------------
2323 //
2324 // setTimeLimit
2325 //
2326 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2327 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2328 if (U_FAILURE(status)) {
2329 return;
2330 }
2331 if (U_FAILURE(fDeferredStatus)) {
2332 status = fDeferredStatus;
2333 return;
2334 }
2335 if (limit < 0) {
2336 status = U_ILLEGAL_ARGUMENT_ERROR;
2337 return;
2338 }
2339 fTimeLimit = limit;
2340 }
2341
2342
2343 //--------------------------------------------------------------------------------
2344 //
2345 // getTimeLimit
2346 //
2347 //--------------------------------------------------------------------------------
getTimeLimit() const2348 int32_t RegexMatcher::getTimeLimit() const {
2349 return fTimeLimit;
2350 }
2351
2352
2353 //--------------------------------------------------------------------------------
2354 //
2355 // setStackLimit
2356 //
2357 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2358 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2359 if (U_FAILURE(status)) {
2360 return;
2361 }
2362 if (U_FAILURE(fDeferredStatus)) {
2363 status = fDeferredStatus;
2364 return;
2365 }
2366 if (limit < 0) {
2367 status = U_ILLEGAL_ARGUMENT_ERROR;
2368 return;
2369 }
2370
2371 // Reset the matcher. This is needed here in case there is a current match
2372 // whose final stack frame (containing the match results, pointed to by fFrame)
2373 // would be lost by resizing to a smaller stack size.
2374 reset();
2375
2376 if (limit == 0) {
2377 // Unlimited stack expansion
2378 fStack->setMaxCapacity(0);
2379 } else {
2380 // Change the units of the limit from bytes to ints, and bump the size up
2381 // to be big enough to hold at least one stack frame for the pattern,
2382 // if it isn't there already.
2383 int32_t adjustedLimit = limit / sizeof(int32_t);
2384 if (adjustedLimit < fPattern->fFrameSize) {
2385 adjustedLimit = fPattern->fFrameSize;
2386 }
2387 fStack->setMaxCapacity(adjustedLimit);
2388 }
2389 fStackLimit = limit;
2390 }
2391
2392
2393 //--------------------------------------------------------------------------------
2394 //
2395 // getStackLimit
2396 //
2397 //--------------------------------------------------------------------------------
getStackLimit() const2398 int32_t RegexMatcher::getStackLimit() const {
2399 return fStackLimit;
2400 }
2401
2402
2403 //--------------------------------------------------------------------------------
2404 //
2405 // setMatchCallback
2406 //
2407 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2408 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2409 const void *context,
2410 UErrorCode &status) {
2411 if (U_FAILURE(status)) {
2412 return;
2413 }
2414 fCallbackFn = callback;
2415 fCallbackContext = context;
2416 }
2417
2418
2419 //--------------------------------------------------------------------------------
2420 //
2421 // getMatchCallback
2422 //
2423 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2424 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2425 const void *&context,
2426 UErrorCode &status) {
2427 if (U_FAILURE(status)) {
2428 return;
2429 }
2430 callback = fCallbackFn;
2431 context = fCallbackContext;
2432 }
2433
2434
2435 //--------------------------------------------------------------------------------
2436 //
2437 // setMatchCallback
2438 //
2439 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2440 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2441 const void *context,
2442 UErrorCode &status) {
2443 if (U_FAILURE(status)) {
2444 return;
2445 }
2446 fFindProgressCallbackFn = callback;
2447 fFindProgressCallbackContext = context;
2448 }
2449
2450
2451 //--------------------------------------------------------------------------------
2452 //
2453 // getMatchCallback
2454 //
2455 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2456 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2457 const void *&context,
2458 UErrorCode &status) {
2459 if (U_FAILURE(status)) {
2460 return;
2461 }
2462 callback = fFindProgressCallbackFn;
2463 context = fFindProgressCallbackContext;
2464 }
2465
2466
2467 //================================================================================
2468 //
2469 // Code following this point in this file is the internal
2470 // Match Engine Implementation.
2471 //
2472 //================================================================================
2473
2474
2475 //--------------------------------------------------------------------------------
2476 //
2477 // resetStack
2478 // Discard any previous contents of the state save stack, and initialize a
2479 // new stack frame to all -1. The -1s are needed for capture group limits,
2480 // where they indicate that a group has not yet matched anything.
2481 //--------------------------------------------------------------------------------
resetStack()2482 REStackFrame *RegexMatcher::resetStack() {
2483 // Discard any previous contents of the state save stack, and initialize a
2484 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2485 // where they indicate that a group has not yet matched anything.
2486 fStack->removeAllElements();
2487
2488 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2489 int32_t i;
2490 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2491 iFrame->fExtra[i] = -1;
2492 }
2493 return iFrame;
2494 }
2495
2496
2497
2498 //--------------------------------------------------------------------------------
2499 //
2500 // isWordBoundary
2501 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2502 // For us,
2503 // If the current char is a combining mark,
2504 // \b is FALSE.
2505 // Else Scan backwards to the first non-combining char.
2506 // We are at a boundary if the this char and the original chars are
2507 // opposite in membership in \w set
2508 //
2509 // parameters: pos - the current position in the input buffer
2510 //
2511 // TODO: double-check edge cases at region boundaries.
2512 //
2513 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2514 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2515 UBool isBoundary = FALSE;
2516 UBool cIsWord = FALSE;
2517
2518 if (pos >= fLookLimit) {
2519 fHitEnd = TRUE;
2520 } else {
2521 // Determine whether char c at current position is a member of the word set of chars.
2522 // If we're off the end of the string, behave as though we're not at a word char.
2523 UTEXT_SETNATIVEINDEX(fInputText, pos);
2524 UChar32 c = UTEXT_CURRENT32(fInputText);
2525 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2526 // Current char is a combining one. Not a boundary.
2527 return FALSE;
2528 }
2529 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2530 }
2531
2532 // Back up until we come to a non-combining char, determine whether
2533 // that char is a word char.
2534 UBool prevCIsWord = FALSE;
2535 for (;;) {
2536 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2537 break;
2538 }
2539 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2540 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2541 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2542 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2543 break;
2544 }
2545 }
2546 isBoundary = cIsWord ^ prevCIsWord;
2547 return isBoundary;
2548 }
2549
isChunkWordBoundary(int32_t pos)2550 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2551 UBool isBoundary = FALSE;
2552 UBool cIsWord = FALSE;
2553
2554 const UChar *inputBuf = fInputText->chunkContents;
2555
2556 if (pos >= fLookLimit) {
2557 fHitEnd = TRUE;
2558 } else {
2559 // Determine whether char c at current position is a member of the word set of chars.
2560 // If we're off the end of the string, behave as though we're not at a word char.
2561 UChar32 c;
2562 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2563 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2564 // Current char is a combining one. Not a boundary.
2565 return FALSE;
2566 }
2567 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2568 }
2569
2570 // Back up until we come to a non-combining char, determine whether
2571 // that char is a word char.
2572 UBool prevCIsWord = FALSE;
2573 for (;;) {
2574 if (pos <= fLookStart) {
2575 break;
2576 }
2577 UChar32 prevChar;
2578 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2579 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2580 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2581 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2582 break;
2583 }
2584 }
2585 isBoundary = cIsWord ^ prevCIsWord;
2586 return isBoundary;
2587 }
2588
2589 //--------------------------------------------------------------------------------
2590 //
2591 // isUWordBoundary
2592 //
2593 // Test for a word boundary using RBBI word break.
2594 //
2595 // parameters: pos - the current position in the input buffer
2596 //
2597 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2598 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2599 UBool returnVal = FALSE;
2600 #if UCONFIG_NO_BREAK_ITERATION==0
2601
2602 // If we haven't yet created a break iterator for this matcher, do it now.
2603 if (fWordBreakItr == NULL) {
2604 fWordBreakItr =
2605 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2606 if (U_FAILURE(fDeferredStatus)) {
2607 return FALSE;
2608 }
2609 fWordBreakItr->setText(fInputText, fDeferredStatus);
2610 }
2611
2612 if (pos >= fLookLimit) {
2613 fHitEnd = TRUE;
2614 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2615 // words are not boundaries. All non-word chars stand by themselves,
2616 // with word boundaries on both sides.
2617 } else {
2618 if (!UTEXT_USES_U16(fInputText)) {
2619 // !!!: Would like a better way to do this!
2620 UErrorCode status = U_ZERO_ERROR;
2621 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2622 }
2623 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2624 }
2625 #endif
2626 return returnVal;
2627 }
2628
2629 //--------------------------------------------------------------------------------
2630 //
2631 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2632 // saves. Increment the "time" counter, and call the
2633 // user callback function if there is one installed.
2634 //
2635 // If the match operation needs to be aborted, either for a time-out
2636 // or because the user callback asked for it, just set an error status.
2637 // The engine will pick that up and stop in its outer loop.
2638 //
2639 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2640 void RegexMatcher::IncrementTime(UErrorCode &status) {
2641 fTickCounter = TIMER_INITIAL_VALUE;
2642 fTime++;
2643 if (fCallbackFn != NULL) {
2644 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2645 status = U_REGEX_STOPPED_BY_CALLER;
2646 return;
2647 }
2648 }
2649 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2650 status = U_REGEX_TIME_OUT;
2651 }
2652 }
2653
2654 //--------------------------------------------------------------------------------
2655 //
2656 // ReportFindProgress This function is called once for each advance in the target
2657 // string from the find() function, and calls the user progress callback
2658 // function if there is one installed.
2659 //
2660 // NOTE:
2661 //
2662 // If the match operation needs to be aborted because the user
2663 // callback asked for it, just set an error status.
2664 // The engine will pick that up and stop in its outer loop.
2665 //
2666 //--------------------------------------------------------------------------------
ReportFindProgress(int64_t matchIndex,UErrorCode & status)2667 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2668 if (fFindProgressCallbackFn != NULL) {
2669 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2670 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2671 return FALSE;
2672 }
2673 }
2674 return TRUE;
2675 }
2676
2677 //--------------------------------------------------------------------------------
2678 //
2679 // StateSave
2680 // Make a new stack frame, initialized as a copy of the current stack frame.
2681 // Set the pattern index in the original stack frame from the operand value
2682 // in the opcode. Execution of the engine continues with the state in
2683 // the newly created stack frame
2684 //
2685 // Note that reserveBlock() may grow the stack, resulting in the
2686 // whole thing being relocated in memory.
2687 //
2688 // Parameters:
2689 // fp The top frame pointer when called. At return, a new
2690 // fame will be present
2691 // savePatIdx An index into the compiled pattern. Goes into the original
2692 // (not new) frame. If execution ever back-tracks out of the
2693 // new frame, this will be where we continue from in the pattern.
2694 // Return
2695 // The new frame pointer.
2696 //
2697 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2698 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2699 // push storage for a new frame.
2700 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2701 if (newFP == NULL) {
2702 // Failure on attempted stack expansion.
2703 // Stack function set some other error code, change it to a more
2704 // specific one for regular expressions.
2705 status = U_REGEX_STACK_OVERFLOW;
2706 // We need to return a writable stack frame, so just return the
2707 // previous frame. The match operation will stop quickly
2708 // because of the error status, after which the frame will never
2709 // be looked at again.
2710 return fp;
2711 }
2712 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2713
2714 // New stack frame = copy of old top frame.
2715 int64_t *source = (int64_t *)fp;
2716 int64_t *dest = newFP;
2717 for (;;) {
2718 *dest++ = *source++;
2719 if (source == newFP) {
2720 break;
2721 }
2722 }
2723
2724 fTickCounter--;
2725 if (fTickCounter <= 0) {
2726 IncrementTime(status); // Re-initializes fTickCounter
2727 }
2728 fp->fPatIdx = savePatIdx;
2729 return (REStackFrame *)newFP;
2730 }
2731
2732
2733 //--------------------------------------------------------------------------------
2734 //
2735 // MatchAt This is the actual matching engine.
2736 //
2737 // startIdx: begin matching a this index.
2738 // toEnd: if true, match must extend to end of the input region
2739 //
2740 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2741 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2742 UBool isMatch = FALSE; // True if the we have a match.
2743
2744 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2745
2746 int32_t op; // Operation from the compiled pattern, split into
2747 int32_t opType; // the opcode
2748 int32_t opValue; // and the operand value.
2749
2750 #ifdef REGEX_RUN_DEBUG
2751 if (fTraceDebug)
2752 {
2753 printf("MatchAt(startIdx=%ld)\n", startIdx);
2754 printf("Original Pattern: ");
2755 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2756 while (c != U_SENTINEL) {
2757 if (c<32 || c>256) {
2758 c = '.';
2759 }
2760 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2761
2762 c = UTEXT_NEXT32(fPattern->fPattern);
2763 }
2764 printf("\n");
2765 printf("Input String: ");
2766 c = utext_next32From(fInputText, 0);
2767 while (c != U_SENTINEL) {
2768 if (c<32 || c>256) {
2769 c = '.';
2770 }
2771 printf("%c", c);
2772
2773 c = UTEXT_NEXT32(fInputText);
2774 }
2775 printf("\n");
2776 printf("\n");
2777 }
2778 #endif
2779
2780 if (U_FAILURE(status)) {
2781 return;
2782 }
2783
2784 // Cache frequently referenced items from the compiled pattern
2785 //
2786 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2787
2788 const UChar *litText = fPattern->fLiteralText.getBuffer();
2789 UVector *sets = fPattern->fSets;
2790
2791 fFrameSize = fPattern->fFrameSize;
2792 REStackFrame *fp = resetStack();
2793
2794 fp->fPatIdx = 0;
2795 fp->fInputIdx = startIdx;
2796
2797 // Zero out the pattern's static data
2798 int32_t i;
2799 for (i = 0; i<fPattern->fDataSize; i++) {
2800 fData[i] = 0;
2801 }
2802
2803 //
2804 // Main loop for interpreting the compiled pattern.
2805 // One iteration of the loop per pattern operation performed.
2806 //
2807 for (;;) {
2808 #if 0
2809 if (_heapchk() != _HEAPOK) {
2810 fprintf(stderr, "Heap Trouble\n");
2811 }
2812 #endif
2813
2814 op = (int32_t)pat[fp->fPatIdx];
2815 opType = URX_TYPE(op);
2816 opValue = URX_VAL(op);
2817 #ifdef REGEX_RUN_DEBUG
2818 if (fTraceDebug) {
2819 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2820 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
2821 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2822 fPattern->dumpOp(fp->fPatIdx);
2823 }
2824 #endif
2825 fp->fPatIdx++;
2826
2827 switch (opType) {
2828
2829
2830 case URX_NOP:
2831 break;
2832
2833
2834 case URX_BACKTRACK:
2835 // Force a backtrack. In some circumstances, the pattern compiler
2836 // will notice that the pattern can't possibly match anything, and will
2837 // emit one of these at that point.
2838 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2839 break;
2840
2841
2842 case URX_ONECHAR:
2843 if (fp->fInputIdx < fActiveLimit) {
2844 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2845 UChar32 c = UTEXT_NEXT32(fInputText);
2846 if (c == opValue) {
2847 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2848 break;
2849 }
2850 } else {
2851 fHitEnd = TRUE;
2852 }
2853
2854 #ifdef REGEX_SMART_BACKTRACKING
2855 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
2856 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2857 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2858 UBool success = FALSE;
2859 UChar32 c = UTEXT_PREVIOUS32(fInputText);
2860 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
2861 if (c == opValue) {
2862 success = TRUE;
2863 break;
2864 } else if (c == U_SENTINEL) {
2865 break;
2866 }
2867 c = UTEXT_PREVIOUS32(fInputText);
2868 }
2869 if (success) {
2870 fHitEnd = FALSE;
2871 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2872 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2873 if (fp->fInputIdx > backSearchIndex) {
2874 fp = StateSave(fp, fp->fPatIdx, status);
2875 }
2876 fp->fPatIdx++; // Skip the LOOP_C, we just did that
2877 break;
2878 }
2879 }
2880 }
2881 #endif
2882
2883 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2884 break;
2885
2886
2887 case URX_STRING:
2888 {
2889 // Test input against a literal string.
2890 // Strings require two slots in the compiled pattern, one for the
2891 // offset to the string text, and one for the length.
2892 int32_t stringStartIdx = opValue;
2893 int32_t stringLen;
2894
2895 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2896 fp->fPatIdx++;
2897 opType = URX_TYPE(op);
2898 stringLen = URX_VAL(op);
2899 U_ASSERT(opType == URX_STRING_LEN);
2900 U_ASSERT(stringLen >= 2);
2901
2902 const UChar *patternChars = litText+stringStartIdx;
2903 const UChar *patternEnd = patternChars+stringLen;
2904
2905 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2906 UChar32 c;
2907 UBool success = TRUE;
2908
2909 while (patternChars < patternEnd && success) {
2910 c = UTEXT_NEXT32(fInputText);
2911
2912 if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
2913 if (U_IS_BMP(c)) {
2914 success = (*patternChars == c);
2915 patternChars += 1;
2916 } else if (patternChars+1 < patternEnd) {
2917 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
2918 patternChars += 2;
2919 }
2920 } else {
2921 success = FALSE;
2922 fHitEnd = TRUE; // TODO: See ticket 6074
2923 }
2924 }
2925
2926 if (success) {
2927 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2928 } else {
2929 #ifdef REGEX_SMART_BACKTRACKING
2930 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
2931 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2932 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2933 // Reset to last start point
2934 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2935 patternChars = litText+stringStartIdx;
2936
2937 // Search backwards for a possible start
2938 do {
2939 c = UTEXT_PREVIOUS32(fInputText);
2940 if (c == U_SENTINEL) {
2941 break;
2942 } else if ((U_IS_BMP(c) && *patternChars == c) ||
2943 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
2944 success = TRUE;
2945 break;
2946 }
2947 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
2948
2949 // And try again
2950 if (success) {
2951 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2952 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2953 if (fp->fInputIdx > backSearchIndex) {
2954 fp = StateSave(fp, fp->fPatIdx, status);
2955 }
2956 fp->fPatIdx++; // Skip the LOOP_C, we just did that
2957 break;
2958 }
2959 }
2960 }
2961 #endif
2962 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2963 }
2964 }
2965 break;
2966
2967
2968 case URX_STATE_SAVE:
2969 fp = StateSave(fp, opValue, status);
2970 break;
2971
2972
2973 case URX_END:
2974 // The match loop will exit via this path on a successful match,
2975 // when we reach the end of the pattern.
2976 if (toEnd && fp->fInputIdx != fActiveLimit) {
2977 // The pattern matched, but not to the end of input. Try some more.
2978 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2979 break;
2980 }
2981 isMatch = TRUE;
2982 goto breakFromLoop;
2983
2984 // Start and End Capture stack frame variables are laid out out like this:
2985 // fp->fExtra[opValue] - The start of a completed capture group
2986 // opValue+1 - The end of a completed capture group
2987 // opValue+2 - the start of a capture group whose end
2988 // has not yet been reached (and might not ever be).
2989 case URX_START_CAPTURE:
2990 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2991 fp->fExtra[opValue+2] = fp->fInputIdx;
2992 break;
2993
2994
2995 case URX_END_CAPTURE:
2996 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2997 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2998 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2999 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
3000 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
3001 break;
3002
3003
3004 case URX_DOLLAR: // $, test for End of line
3005 // or for position before new line at end of input
3006 {
3007 if (fp->fInputIdx >= fAnchorLimit) {
3008 // We really are at the end of input. Success.
3009 fHitEnd = TRUE;
3010 fRequireEnd = TRUE;
3011 break;
3012 }
3013
3014 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3015
3016 // If we are positioned just before a new-line that is located at the
3017 // end of input, succeed.
3018 UChar32 c = UTEXT_NEXT32(fInputText);
3019 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3020 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
3021 // If not in the middle of a CR/LF sequence
3022 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
3023 // At new-line at end of input. Success
3024 fHitEnd = TRUE;
3025 fRequireEnd = TRUE;
3026
3027 break;
3028 }
3029 }
3030 } else {
3031 UChar32 nextC = UTEXT_NEXT32(fInputText);
3032 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3033 fHitEnd = TRUE;
3034 fRequireEnd = TRUE;
3035 break; // At CR/LF at end of input. Success
3036 }
3037 }
3038
3039 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3040 }
3041 break;
3042
3043
3044 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
3045 if (fp->fInputIdx >= fAnchorLimit) {
3046 // Off the end of input. Success.
3047 fHitEnd = TRUE;
3048 fRequireEnd = TRUE;
3049 break;
3050 } else {
3051 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3052 UChar32 c = UTEXT_NEXT32(fInputText);
3053 // Either at the last character of input, or off the end.
3054 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3055 fHitEnd = TRUE;
3056 fRequireEnd = TRUE;
3057 break;
3058 }
3059 }
3060
3061 // Not at end of input. Back-track out.
3062 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3063 break;
3064
3065
3066 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
3067 {
3068 if (fp->fInputIdx >= fAnchorLimit) {
3069 // We really are at the end of input. Success.
3070 fHitEnd = TRUE;
3071 fRequireEnd = TRUE;
3072 break;
3073 }
3074 // If we are positioned just before a new-line, succeed.
3075 // It makes no difference where the new-line is within the input.
3076 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3077 UChar32 c = UTEXT_CURRENT32(fInputText);
3078 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3079 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3080 // In multi-line mode, hitting a new-line just before the end of input does not
3081 // set the hitEnd or requireEnd flags
3082 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3083 break;
3084 }
3085 }
3086 // not at a new line. Fail.
3087 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3088 }
3089 break;
3090
3091
3092 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3093 {
3094 if (fp->fInputIdx >= fAnchorLimit) {
3095 // We really are at the end of input. Success.
3096 fHitEnd = TRUE;
3097 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3098 break; // adding a new-line would not lose the match.
3099 }
3100 // If we are not positioned just before a new-line, the test fails; backtrack out.
3101 // It makes no difference where the new-line is within the input.
3102 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3103 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3104 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3105 }
3106 }
3107 break;
3108
3109
3110 case URX_CARET: // ^, test for start of line
3111 if (fp->fInputIdx != fAnchorStart) {
3112 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3113 }
3114 break;
3115
3116
3117 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3118 {
3119 if (fp->fInputIdx == fAnchorStart) {
3120 // We are at the start input. Success.
3121 break;
3122 }
3123 // Check whether character just before the current pos is a new-line
3124 // unless we are at the end of input
3125 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3126 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3127 if ((fp->fInputIdx < fAnchorLimit) &&
3128 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3129 // It's a new-line. ^ is true. Success.
3130 // TODO: what should be done with positions between a CR and LF?
3131 break;
3132 }
3133 // Not at the start of a line. Fail.
3134 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3135 }
3136 break;
3137
3138
3139 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3140 {
3141 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3142 if (fp->fInputIdx <= fAnchorStart) {
3143 // We are at the start input. Success.
3144 break;
3145 }
3146 // Check whether character just before the current pos is a new-line
3147 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3148 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3149 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3150 if (c != 0x0a) {
3151 // Not at the start of a line. Back-track out.
3152 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3153 }
3154 }
3155 break;
3156
3157 case URX_BACKSLASH_B: // Test for word boundaries
3158 {
3159 UBool success = isWordBoundary(fp->fInputIdx);
3160 success ^= (opValue != 0); // flip sense for \B
3161 if (!success) {
3162 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3163 }
3164 }
3165 break;
3166
3167
3168 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3169 {
3170 UBool success = isUWordBoundary(fp->fInputIdx);
3171 success ^= (opValue != 0); // flip sense for \B
3172 if (!success) {
3173 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3174 }
3175 }
3176 break;
3177
3178
3179 case URX_BACKSLASH_D: // Test for decimal digit
3180 {
3181 if (fp->fInputIdx >= fActiveLimit) {
3182 fHitEnd = TRUE;
3183 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3184 break;
3185 }
3186
3187 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3188
3189 UChar32 c = UTEXT_NEXT32(fInputText);
3190 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3191 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3192 success ^= (opValue != 0); // flip sense for \D
3193 if (success) {
3194 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3195 } else {
3196 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3197 }
3198 }
3199 break;
3200
3201
3202 case URX_BACKSLASH_G: // Test for position at end of previous match
3203 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3204 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3205 }
3206 break;
3207
3208
3209 case URX_BACKSLASH_X:
3210 // Match a Grapheme, as defined by Unicode TR 29.
3211 // Differs slightly from Perl, which consumes combining marks independently
3212 // of context.
3213 {
3214
3215 // Fail if at end of input
3216 if (fp->fInputIdx >= fActiveLimit) {
3217 fHitEnd = TRUE;
3218 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3219 break;
3220 }
3221
3222 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3223
3224 // Examine (and consume) the current char.
3225 // Dispatch into a little state machine, based on the char.
3226 UChar32 c;
3227 c = UTEXT_NEXT32(fInputText);
3228 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3229 UnicodeSet **sets = fPattern->fStaticSets;
3230 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3231 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3232 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3233 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3234 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3235 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3236 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3237 goto GC_Extend;
3238
3239
3240
3241 GC_L:
3242 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3243 c = UTEXT_NEXT32(fInputText);
3244 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3245 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3246 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3247 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3248 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3249 UTEXT_PREVIOUS32(fInputText);
3250 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3251 goto GC_Extend;
3252
3253 GC_V:
3254 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3255 c = UTEXT_NEXT32(fInputText);
3256 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3257 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3258 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3259 UTEXT_PREVIOUS32(fInputText);
3260 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3261 goto GC_Extend;
3262
3263 GC_T:
3264 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3265 c = UTEXT_NEXT32(fInputText);
3266 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3267 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3268 UTEXT_PREVIOUS32(fInputText);
3269 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3270 goto GC_Extend;
3271
3272 GC_Extend:
3273 // Combining characters are consumed here
3274 for (;;) {
3275 if (fp->fInputIdx >= fActiveLimit) {
3276 break;
3277 }
3278 c = UTEXT_CURRENT32(fInputText);
3279 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3280 break;
3281 }
3282 UTEXT_NEXT32(fInputText);
3283 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3284 }
3285 goto GC_Done;
3286
3287 GC_Control:
3288 // Most control chars stand alone (don't combine with combining chars),
3289 // except for that CR/LF sequence is a single grapheme cluster.
3290 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3291 c = UTEXT_NEXT32(fInputText);
3292 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3293 }
3294
3295 GC_Done:
3296 if (fp->fInputIdx >= fActiveLimit) {
3297 fHitEnd = TRUE;
3298 }
3299 break;
3300 }
3301
3302
3303
3304
3305 case URX_BACKSLASH_Z: // Test for end of Input
3306 if (fp->fInputIdx < fAnchorLimit) {
3307 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3308 } else {
3309 fHitEnd = TRUE;
3310 fRequireEnd = TRUE;
3311 }
3312 break;
3313
3314
3315
3316 case URX_STATIC_SETREF:
3317 {
3318 // Test input character against one of the predefined sets
3319 // (Word Characters, for example)
3320 // The high bit of the op value is a flag for the match polarity.
3321 // 0: success if input char is in set.
3322 // 1: success if input char is not in set.
3323 if (fp->fInputIdx >= fActiveLimit) {
3324 fHitEnd = TRUE;
3325 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3326 break;
3327 }
3328
3329 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3330 opValue &= ~URX_NEG_SET;
3331 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3332
3333 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3334 UChar32 c = UTEXT_NEXT32(fInputText);
3335 if (c < 256) {
3336 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3337 if (s8->contains(c)) {
3338 success = !success;
3339 }
3340 } else {
3341 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3342 if (s->contains(c)) {
3343 success = !success;
3344 }
3345 }
3346 if (success) {
3347 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3348 } else {
3349 // the character wasn't in the set.
3350 #ifdef REGEX_SMART_BACKTRACKING
3351 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3352 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3353 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3354 // Try to find it, backwards
3355 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3356 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
3357 do {
3358 c = UTEXT_PREVIOUS32(fInputText);
3359 if (c == U_SENTINEL) {
3360 break;
3361 } else if (c < 256) {
3362 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3363 if (s8->contains(c)) {
3364 success = !success;
3365 }
3366 } else {
3367 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3368 if (s->contains(c)) {
3369 success = !success;
3370 }
3371 }
3372 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
3373
3374 if (success && c != U_SENTINEL) {
3375 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3376 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3377 if (fp->fInputIdx > backSearchIndex) {
3378 fp = StateSave(fp, fp->fPatIdx, status);
3379 }
3380 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3381 break;
3382 }
3383 }
3384 }
3385 #endif
3386 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3387 }
3388 }
3389 break;
3390
3391
3392 case URX_STAT_SETREF_N:
3393 {
3394 // Test input character for NOT being a member of one of
3395 // the predefined sets (Word Characters, for example)
3396 if (fp->fInputIdx >= fActiveLimit) {
3397 fHitEnd = TRUE;
3398 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3399 break;
3400 }
3401
3402 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3403
3404 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3405
3406 UChar32 c = UTEXT_NEXT32(fInputText);
3407 if (c < 256) {
3408 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3409 if (s8->contains(c) == FALSE) {
3410 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3411 break;
3412 }
3413 } else {
3414 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3415 if (s->contains(c) == FALSE) {
3416 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3417 break;
3418 }
3419 }
3420 // the character wasn't in the set.
3421 #ifdef REGEX_SMART_BACKTRACKING
3422 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3423 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3424 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3425 // Try to find it, backwards
3426 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3427 UBool success = FALSE;
3428 do {
3429 c = UTEXT_PREVIOUS32(fInputText);
3430 if (c == U_SENTINEL) {
3431 break;
3432 } else if (c < 256) {
3433 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3434 if (s8->contains(c) == FALSE) {
3435 success = TRUE;
3436 break;
3437 }
3438 } else {
3439 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3440 if (s->contains(c) == FALSE) {
3441 success = TRUE;
3442 break;
3443 }
3444 }
3445 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3446
3447 if (success) {
3448 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3449 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3450 if (fp->fInputIdx > backSearchIndex) {
3451 fp = StateSave(fp, fp->fPatIdx, status);
3452 }
3453 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3454 break;
3455 }
3456 }
3457 }
3458 #endif
3459 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3460 }
3461 break;
3462
3463
3464 case URX_SETREF:
3465 if (fp->fInputIdx >= fActiveLimit) {
3466 fHitEnd = TRUE;
3467 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3468 break;
3469 } else {
3470 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3471
3472 // There is input left. Pick up one char and test it for set membership.
3473 UChar32 c = UTEXT_NEXT32(fInputText);
3474 U_ASSERT(opValue > 0 && opValue < sets->size());
3475 if (c<256) {
3476 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3477 if (s8->contains(c)) {
3478 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3479 break;
3480 }
3481 } else {
3482 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3483 if (s->contains(c)) {
3484 // The character is in the set. A Match.
3485 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3486 break;
3487 }
3488 }
3489
3490 // the character wasn't in the set.
3491 #ifdef REGEX_SMART_BACKTRACKING
3492 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3493 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3494 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3495 // Try to find it, backwards
3496 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3497 UBool success = FALSE;
3498 do {
3499 c = UTEXT_PREVIOUS32(fInputText);
3500 if (c == U_SENTINEL) {
3501 break;
3502 } else if (c < 256) {
3503 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3504 if (s8->contains(c)) {
3505 success = TRUE;
3506 break;
3507 }
3508 } else {
3509 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3510 if (s->contains(c)) {
3511 success = TRUE;
3512 break;
3513 }
3514 }
3515 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3516
3517 if (success) {
3518 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3519 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3520 if (fp->fInputIdx > backSearchIndex) {
3521 fp = StateSave(fp, fp->fPatIdx, status);
3522 }
3523 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3524 break;
3525 }
3526 }
3527 }
3528 #endif
3529 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3530 }
3531 break;
3532
3533
3534 case URX_DOTANY:
3535 {
3536 // . matches anything, but stops at end-of-line.
3537 if (fp->fInputIdx >= fActiveLimit) {
3538 // At end of input. Match failed. Backtrack out.
3539 fHitEnd = TRUE;
3540 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3541 break;
3542 }
3543
3544 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3545
3546 // There is input left. Advance over one char, unless we've hit end-of-line
3547 UChar32 c = UTEXT_NEXT32(fInputText);
3548 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3549 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3550 // End of line in normal mode. . does not match.
3551 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3552 break;
3553 }
3554 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3555 }
3556 break;
3557
3558
3559 case URX_DOTANY_ALL:
3560 {
3561 // ., in dot-matches-all (including new lines) mode
3562 if (fp->fInputIdx >= fActiveLimit) {
3563 // At end of input. Match failed. Backtrack out.
3564 fHitEnd = TRUE;
3565 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3566 break;
3567 }
3568
3569 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3570
3571 // There is input left. Advance over one char, except if we are
3572 // at a cr/lf, advance over both of them.
3573 UChar32 c;
3574 c = UTEXT_NEXT32(fInputText);
3575 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3576 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3577 // In the case of a CR/LF, we need to advance over both.
3578 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3579 if (nextc == 0x0a) {
3580 UTEXT_NEXT32(fInputText);
3581 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3582 }
3583 }
3584 }
3585 break;
3586
3587
3588 case URX_DOTANY_UNIX:
3589 {
3590 // '.' operator, matches all, but stops at end-of-line.
3591 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3592 if (fp->fInputIdx >= fActiveLimit) {
3593 // At end of input. Match failed. Backtrack out.
3594 fHitEnd = TRUE;
3595 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3596 break;
3597 }
3598
3599 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3600
3601 // There is input left. Advance over one char, unless we've hit end-of-line
3602 UChar32 c = UTEXT_NEXT32(fInputText);
3603 if (c == 0x0a) {
3604 // End of line in normal mode. '.' does not match the \n
3605 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3606 } else {
3607 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3608 }
3609 }
3610 break;
3611
3612
3613 case URX_JMP:
3614 fp->fPatIdx = opValue;
3615 break;
3616
3617 case URX_FAIL:
3618 isMatch = FALSE;
3619 goto breakFromLoop;
3620
3621 case URX_JMP_SAV:
3622 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3623 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3624 fp->fPatIdx = opValue; // Then JMP.
3625 break;
3626
3627 case URX_JMP_SAV_X:
3628 // This opcode is used with (x)+, when x can match a zero length string.
3629 // Same as JMP_SAV, except conditional on the match having made forward progress.
3630 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3631 // data address of the input position at the start of the loop.
3632 {
3633 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3634 int32_t stoOp = (int32_t)pat[opValue-1];
3635 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3636 int32_t frameLoc = URX_VAL(stoOp);
3637 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3638 int64_t prevInputIdx = fp->fExtra[frameLoc];
3639 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3640 if (prevInputIdx < fp->fInputIdx) {
3641 // The match did make progress. Repeat the loop.
3642 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3643 fp->fPatIdx = opValue;
3644 fp->fExtra[frameLoc] = fp->fInputIdx;
3645 }
3646 // If the input position did not advance, we do nothing here,
3647 // execution will fall out of the loop.
3648 }
3649 break;
3650
3651 case URX_CTR_INIT:
3652 {
3653 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3654 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3655
3656 // Pick up the three extra operands that CTR_INIT has, and
3657 // skip the pattern location counter past
3658 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3659 fp->fPatIdx += 3;
3660 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3661 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3662 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3663 U_ASSERT(minCount>=0);
3664 U_ASSERT(maxCount>=minCount || maxCount==-1);
3665 U_ASSERT(loopLoc>fp->fPatIdx);
3666
3667 if (minCount == 0) {
3668 fp = StateSave(fp, loopLoc+1, status);
3669 }
3670 if (maxCount == 0) {
3671 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3672 }
3673 }
3674 break;
3675
3676 case URX_CTR_LOOP:
3677 {
3678 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3679 int32_t initOp = (int32_t)pat[opValue];
3680 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3681 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3682 int32_t minCount = (int32_t)pat[opValue+2];
3683 int32_t maxCount = (int32_t)pat[opValue+3];
3684 // Increment the counter. Note: we DIDN'T worry about counter
3685 // overflow, since the data comes from UnicodeStrings, which
3686 // stores its length in an int32_t. Do we have to think about
3687 // this now that we're using UText? Probably not, since the length
3688 // in UChar32s is still an int32_t.
3689 (*pCounter)++;
3690 U_ASSERT(*pCounter > 0);
3691 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3692 U_ASSERT(*pCounter == maxCount || maxCount == -1);
3693 break;
3694 }
3695 if (*pCounter >= minCount) {
3696 fp = StateSave(fp, fp->fPatIdx, status);
3697 }
3698 fp->fPatIdx = opValue + 4; // Loop back.
3699 }
3700 break;
3701
3702 case URX_CTR_INIT_NG:
3703 {
3704 // Initialize a non-greedy loop
3705 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3706 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3707
3708 // Pick up the three extra operands that CTR_INIT has, and
3709 // skip the pattern location counter past
3710 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3711 fp->fPatIdx += 3;
3712 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3713 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3714 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3715 U_ASSERT(minCount>=0);
3716 U_ASSERT(maxCount>=minCount || maxCount==-1);
3717 U_ASSERT(loopLoc>fp->fPatIdx);
3718
3719 if (minCount == 0) {
3720 if (maxCount != 0) {
3721 fp = StateSave(fp, fp->fPatIdx, status);
3722 }
3723 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3724 }
3725 }
3726 break;
3727
3728 case URX_CTR_LOOP_NG:
3729 {
3730 // Non-greedy {min, max} loops
3731 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3732 int32_t initOp = (int32_t)pat[opValue];
3733 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3734 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3735 int32_t minCount = (int32_t)pat[opValue+2];
3736 int32_t maxCount = (int32_t)pat[opValue+3];
3737 // Increment the counter. Note: we DIDN'T worry about counter
3738 // overflow, since the data comes from UnicodeStrings, which
3739 // stores its length in an int32_t. Do we have to think about
3740 // this now that we're using UText? Probably not, since the length
3741 // in UChar32s is still an int32_t.
3742 (*pCounter)++;
3743 U_ASSERT(*pCounter > 0);
3744
3745 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3746 // The loop has matched the maximum permitted number of times.
3747 // Break out of here with no action. Matching will
3748 // continue with the following pattern.
3749 U_ASSERT(*pCounter == maxCount || maxCount == -1);
3750 break;
3751 }
3752
3753 if (*pCounter < minCount) {
3754 // We haven't met the minimum number of matches yet.
3755 // Loop back for another one.
3756 fp->fPatIdx = opValue + 4; // Loop back.
3757 } else {
3758 // We do have the minimum number of matches.
3759 // Fall into the following pattern, but first do
3760 // a state save to the top of the loop, so that a failure
3761 // in the following pattern will try another iteration of the loop.
3762 fp = StateSave(fp, opValue + 4, status);
3763 }
3764 }
3765 break;
3766
3767 case URX_STO_SP:
3768 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3769 fData[opValue] = fStack->size();
3770 break;
3771
3772 case URX_LD_SP:
3773 {
3774 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3775 int32_t newStackSize = (int32_t)fData[opValue];
3776 U_ASSERT(newStackSize <= fStack->size());
3777 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3778 if (newFP == (int64_t *)fp) {
3779 break;
3780 }
3781 int32_t i;
3782 for (i=0; i<fFrameSize; i++) {
3783 newFP[i] = ((int64_t *)fp)[i];
3784 }
3785 fp = (REStackFrame *)newFP;
3786 fStack->setSize(newStackSize);
3787 }
3788 break;
3789
3790 case URX_BACKREF:
3791 case URX_BACKREF_I:
3792 {
3793 U_ASSERT(opValue < fFrameSize);
3794 int64_t groupStartIdx = fp->fExtra[opValue];
3795 int64_t groupEndIdx = fp->fExtra[opValue+1];
3796 U_ASSERT(groupStartIdx <= groupEndIdx);
3797 if (groupStartIdx < 0) {
3798 // This capture group has not participated in the match thus far,
3799 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3800 }
3801
3802 if (groupEndIdx == groupStartIdx) {
3803 // The capture group match was of an empty string.
3804 // Verified by testing: Perl matches succeed in this case, so
3805 // we do too.
3806 break;
3807 }
3808
3809 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3810 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3811
3812 UBool haveMatch = (opType == URX_BACKREF ?
3813 (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
3814 (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
3815 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3816
3817 if (fp->fInputIdx > fActiveLimit) {
3818 fHitEnd = TRUE;
3819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3820 } else if (!haveMatch) {
3821 if (fp->fInputIdx == fActiveLimit) {
3822 fHitEnd = TRUE;
3823 }
3824 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3825 }
3826 }
3827 break;
3828
3829 case URX_STO_INP_LOC:
3830 {
3831 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3832 fp->fExtra[opValue] = fp->fInputIdx;
3833 }
3834 break;
3835
3836 case URX_JMPX:
3837 {
3838 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3839 fp->fPatIdx += 1;
3840 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3841 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3842 int64_t savedInputIdx = fp->fExtra[dataLoc];
3843 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3844 if (savedInputIdx < fp->fInputIdx) {
3845 fp->fPatIdx = opValue; // JMP
3846 } else {
3847 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3848 }
3849 }
3850 break;
3851
3852 case URX_LA_START:
3853 {
3854 // Entering a lookahead block.
3855 // Save Stack Ptr, Input Pos.
3856 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3857 fData[opValue] = fStack->size();
3858 fData[opValue+1] = fp->fInputIdx;
3859 fActiveStart = fLookStart; // Set the match region change for
3860 fActiveLimit = fLookLimit; // transparent bounds.
3861 }
3862 break;
3863
3864 case URX_LA_END:
3865 {
3866 // Leaving a look-ahead block.
3867 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3868 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3869 int32_t stackSize = fStack->size();
3870 int32_t newStackSize =(int32_t)fData[opValue];
3871 U_ASSERT(stackSize >= newStackSize);
3872 if (stackSize > newStackSize) {
3873 // Copy the current top frame back to the new (cut back) top frame.
3874 // This makes the capture groups from within the look-ahead
3875 // expression available.
3876 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3877 int32_t i;
3878 for (i=0; i<fFrameSize; i++) {
3879 newFP[i] = ((int64_t *)fp)[i];
3880 }
3881 fp = (REStackFrame *)newFP;
3882 fStack->setSize(newStackSize);
3883 }
3884 fp->fInputIdx = fData[opValue+1];
3885
3886 // Restore the active region bounds in the input string; they may have
3887 // been changed because of transparent bounds on a Region.
3888 fActiveStart = fRegionStart;
3889 fActiveLimit = fRegionLimit;
3890 }
3891 break;
3892
3893 case URX_ONECHAR_I:
3894 if (fp->fInputIdx < fActiveLimit) {
3895 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3896
3897 UChar32 c = UTEXT_NEXT32(fInputText);
3898 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3899 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3900 break;
3901 }
3902 } else {
3903 fHitEnd = TRUE;
3904 }
3905
3906 #ifdef REGEX_SMART_BACKTRACKING
3907 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3908 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3909 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3910 UBool success = FALSE;
3911 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3912 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
3913 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3914 success = TRUE;
3915 break;
3916 } else if (c == U_SENTINEL) {
3917 break;
3918 }
3919 c = UTEXT_PREVIOUS32(fInputText);
3920 }
3921 if (success) {
3922 fHitEnd = FALSE;
3923 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3924 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3925 if (fp->fInputIdx > backSearchIndex) {
3926 fp = StateSave(fp, fp->fPatIdx, status);
3927 }
3928 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3929 break;
3930 }
3931 }
3932 }
3933 #endif
3934
3935 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3936 break;
3937
3938 case URX_STRING_I:
3939 {
3940 // Test input against a literal string.
3941 // Strings require two slots in the compiled pattern, one for the
3942 // offset to the string text, and one for the length.
3943 const UCaseProps *csp = ucase_getSingleton();
3944 {
3945 int32_t stringStartIdx, stringLen;
3946 stringStartIdx = opValue;
3947
3948 op = (int32_t)pat[fp->fPatIdx];
3949 fp->fPatIdx++;
3950 opType = URX_TYPE(op);
3951 opValue = URX_VAL(op);
3952 U_ASSERT(opType == URX_STRING_LEN);
3953 stringLen = opValue;
3954
3955 const UChar *patternChars = litText+stringStartIdx;
3956 const UChar *patternEnd = patternChars+stringLen;
3957
3958 const UChar *foldChars = NULL;
3959 int32_t foldOffset, foldLength;
3960 UChar32 c;
3961
3962 foldOffset = foldLength = 0;
3963 UBool success = TRUE;
3964
3965 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3966 while (patternChars < patternEnd && success) {
3967 if(foldOffset < foldLength) {
3968 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3969 } else {
3970 c = UTEXT_NEXT32(fInputText);
3971 if (c != U_SENTINEL) {
3972 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3973 if(foldLength >= 0) {
3974 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
3975 foldOffset = 0;
3976 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3977 } else {
3978 c = foldLength;
3979 foldLength = foldOffset; // to avoid reading chars from the folding buffer
3980 }
3981 }
3982 }
3983
3984 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3985 }
3986
3987 success = FALSE;
3988 if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
3989 if (U_IS_BMP(c)) {
3990 success = (*patternChars == c);
3991 patternChars += 1;
3992 } else if (patternChars+1 < patternEnd) {
3993 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
3994 patternChars += 2;
3995 }
3996 } else {
3997 fHitEnd = TRUE; // TODO: See ticket 6074
3998 }
3999 }
4000
4001 if (!success) {
4002 #ifdef REGEX_SMART_BACKTRACKING
4003 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4004 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4005 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4006 // Reset to last start point
4007 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4008 patternChars = litText+stringStartIdx;
4009
4010 // Search backwards for a possible start
4011 do {
4012 c = UTEXT_PREVIOUS32(fInputText);
4013 if (c == U_SENTINEL) {
4014 break;
4015 } else {
4016 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
4017 if(foldLength >= 0) {
4018 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
4019 foldOffset = 0;
4020 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
4021 } else {
4022 c = foldLength;
4023 foldLength = foldOffset; // to avoid reading chars from the folding buffer
4024 }
4025 }
4026
4027 if ((U_IS_BMP(c) && *patternChars == c) ||
4028 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
4029 success = TRUE;
4030 break;
4031 }
4032 }
4033 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
4034
4035 // And try again
4036 if (success) {
4037 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4038 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4039 if (fp->fInputIdx > backSearchIndex) {
4040 fp = StateSave(fp, fp->fPatIdx, status);
4041 }
4042 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4043 break;
4044 }
4045 }
4046 }
4047 #endif
4048 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4049 }
4050 }
4051 }
4052 break;
4053
4054 case URX_LB_START:
4055 {
4056 // Entering a look-behind block.
4057 // Save Stack Ptr, Input Pos.
4058 // TODO: implement transparent bounds. Ticket #6067
4059 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4060 fData[opValue] = fStack->size();
4061 fData[opValue+1] = fp->fInputIdx;
4062 // Init the variable containing the start index for attempted matches.
4063 fData[opValue+2] = -1;
4064 // Save input string length, then reset to pin any matches to end at
4065 // the current position.
4066 fData[opValue+3] = fActiveLimit;
4067 fActiveLimit = fp->fInputIdx;
4068 }
4069 break;
4070
4071
4072 case URX_LB_CONT:
4073 {
4074 // Positive Look-Behind, at top of loop checking for matches of LB expression
4075 // at all possible input starting positions.
4076
4077 // Fetch the min and max possible match lengths. They are the operands
4078 // of this op in the pattern.
4079 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4080 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4081 U_ASSERT(minML <= maxML);
4082 U_ASSERT(minML >= 0);
4083
4084 // Fetch (from data) the last input index where a match was attempted.
4085 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4086 int64_t *lbStartIdx = &fData[opValue+2];
4087 if (*lbStartIdx < 0) {
4088 // First time through loop.
4089 *lbStartIdx = fp->fInputIdx - minML;
4090 } else {
4091 // 2nd through nth time through the loop.
4092 // Back up start position for match by one.
4093 if (*lbStartIdx == 0) {
4094 (*lbStartIdx)--;
4095 } else {
4096 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4097 UTEXT_PREVIOUS32(fInputText);
4098 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4099 }
4100 }
4101
4102 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4103 // We have tried all potential match starting points without
4104 // getting a match. Backtrack out, and out of the
4105 // Look Behind altogether.
4106 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4107 int64_t restoreInputLen = fData[opValue+3];
4108 U_ASSERT(restoreInputLen >= fActiveLimit);
4109 U_ASSERT(restoreInputLen <= fInputLength);
4110 fActiveLimit = restoreInputLen;
4111 break;
4112 }
4113
4114 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4115 // (successful match will fall off the end of the loop.)
4116 fp = StateSave(fp, fp->fPatIdx-3, status);
4117 fp->fInputIdx = *lbStartIdx;
4118 }
4119 break;
4120
4121 case URX_LB_END:
4122 // End of a look-behind block, after a successful match.
4123 {
4124 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4125 if (fp->fInputIdx != fActiveLimit) {
4126 // The look-behind expression matched, but the match did not
4127 // extend all the way to the point that we are looking behind from.
4128 // FAIL out of here, which will take us back to the LB_CONT, which
4129 // will retry the match starting at another position or fail
4130 // the look-behind altogether, whichever is appropriate.
4131 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4132 break;
4133 }
4134
4135 // Look-behind match is good. Restore the orignal input string length,
4136 // which had been truncated to pin the end of the lookbehind match to the
4137 // position being looked-behind.
4138 int64_t originalInputLen = fData[opValue+3];
4139 U_ASSERT(originalInputLen >= fActiveLimit);
4140 U_ASSERT(originalInputLen <= fInputLength);
4141 fActiveLimit = originalInputLen;
4142 }
4143 break;
4144
4145
4146 case URX_LBN_CONT:
4147 {
4148 // Negative Look-Behind, at top of loop checking for matches of LB expression
4149 // at all possible input starting positions.
4150
4151 // Fetch the extra parameters of this op.
4152 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4153 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4154 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4155 continueLoc = URX_VAL(continueLoc);
4156 U_ASSERT(minML <= maxML);
4157 U_ASSERT(minML >= 0);
4158 U_ASSERT(continueLoc > fp->fPatIdx);
4159
4160 // Fetch (from data) the last input index where a match was attempted.
4161 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4162 int64_t *lbStartIdx = &fData[opValue+2];
4163 if (*lbStartIdx < 0) {
4164 // First time through loop.
4165 *lbStartIdx = fp->fInputIdx - minML;
4166 } else {
4167 // 2nd through nth time through the loop.
4168 // Back up start position for match by one.
4169 if (*lbStartIdx == 0) {
4170 (*lbStartIdx)--;
4171 } else {
4172 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4173 UTEXT_PREVIOUS32(fInputText);
4174 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4175 }
4176 }
4177
4178 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4179 // We have tried all potential match starting points without
4180 // getting a match, which means that the negative lookbehind as
4181 // a whole has succeeded. Jump forward to the continue location
4182 int64_t restoreInputLen = fData[opValue+3];
4183 U_ASSERT(restoreInputLen >= fActiveLimit);
4184 U_ASSERT(restoreInputLen <= fInputLength);
4185 fActiveLimit = restoreInputLen;
4186 fp->fPatIdx = continueLoc;
4187 break;
4188 }
4189
4190 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4191 // (successful match will cause a FAIL out of the loop altogether.)
4192 fp = StateSave(fp, fp->fPatIdx-4, status);
4193 fp->fInputIdx = *lbStartIdx;
4194 }
4195 break;
4196
4197 case URX_LBN_END:
4198 // End of a negative look-behind block, after a successful match.
4199 {
4200 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4201 if (fp->fInputIdx != fActiveLimit) {
4202 // The look-behind expression matched, but the match did not
4203 // extend all the way to the point that we are looking behind from.
4204 // FAIL out of here, which will take us back to the LB_CONT, which
4205 // will retry the match starting at another position or succeed
4206 // the look-behind altogether, whichever is appropriate.
4207 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4208 break;
4209 }
4210
4211 // Look-behind expression matched, which means look-behind test as
4212 // a whole Fails
4213
4214 // Restore the orignal input string length, which had been truncated
4215 // inorder to pin the end of the lookbehind match
4216 // to the position being looked-behind.
4217 int64_t originalInputLen = fData[opValue+3];
4218 U_ASSERT(originalInputLen >= fActiveLimit);
4219 U_ASSERT(originalInputLen <= fInputLength);
4220 fActiveLimit = originalInputLen;
4221
4222 // Restore original stack position, discarding any state saved
4223 // by the successful pattern match.
4224 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4225 int32_t newStackSize = (int32_t)fData[opValue];
4226 U_ASSERT(fStack->size() > newStackSize);
4227 fStack->setSize(newStackSize);
4228
4229 // FAIL, which will take control back to someplace
4230 // prior to entering the look-behind test.
4231 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4232 }
4233 break;
4234
4235
4236 case URX_LOOP_SR_I:
4237 // Loop Initialization for the optimized implementation of
4238 // [some character set]*
4239 // This op scans through all matching input.
4240 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4241 {
4242 U_ASSERT(opValue > 0 && opValue < sets->size());
4243 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4244 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4245
4246 // Loop through input, until either the input is exhausted or
4247 // we reach a character that is not a member of the set.
4248 int64_t ix = fp->fInputIdx;
4249 UTEXT_SETNATIVEINDEX(fInputText, ix);
4250 for (;;) {
4251 if (ix >= fActiveLimit) {
4252 fHitEnd = TRUE;
4253 break;
4254 }
4255 UChar32 c = UTEXT_NEXT32(fInputText);
4256 if (c<256) {
4257 if (s8->contains(c) == FALSE) {
4258 break;
4259 }
4260 } else {
4261 if (s->contains(c) == FALSE) {
4262 break;
4263 }
4264 }
4265 ix = UTEXT_GETNATIVEINDEX(fInputText);
4266 }
4267
4268 // If there were no matching characters, skip over the loop altogether.
4269 // The loop doesn't run at all, a * op always succeeds.
4270 if (ix == fp->fInputIdx) {
4271 fp->fPatIdx++; // skip the URX_LOOP_C op.
4272 break;
4273 }
4274
4275 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4276 // must follow. It's operand is the stack location
4277 // that holds the starting input index for the match of this [set]*
4278 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4279 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4280 int32_t stackLoc = URX_VAL(loopcOp);
4281 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4282 fp->fExtra[stackLoc] = fp->fInputIdx;
4283 #ifdef REGEX_SMART_BACKTRACKING
4284 backSearchIndex = fp->fInputIdx;
4285 #endif
4286 fp->fInputIdx = ix;
4287
4288 // Save State to the URX_LOOP_C op that follows this one,
4289 // so that match failures in the following code will return to there.
4290 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4291 fp = StateSave(fp, fp->fPatIdx, status);
4292 fp->fPatIdx++;
4293 }
4294 break;
4295
4296
4297 case URX_LOOP_DOT_I:
4298 // Loop Initialization for the optimized implementation of .*
4299 // This op scans through all remaining input.
4300 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4301 {
4302 // Loop through input until the input is exhausted (we reach an end-of-line)
4303 // In DOTALL mode, we can just go straight to the end of the input.
4304 int64_t ix;
4305 if ((opValue & 1) == 1) {
4306 // Dot-matches-All mode. Jump straight to the end of the string.
4307 ix = fActiveLimit;
4308 fHitEnd = TRUE;
4309 } else {
4310 // NOT DOT ALL mode. Line endings do not match '.'
4311 // Scan forward until a line ending or end of input.
4312 ix = fp->fInputIdx;
4313 UTEXT_SETNATIVEINDEX(fInputText, ix);
4314 for (;;) {
4315 if (ix >= fActiveLimit) {
4316 fHitEnd = TRUE;
4317 break;
4318 }
4319 UChar32 c = UTEXT_NEXT32(fInputText);
4320 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4321 if ((c == 0x0a) || // 0x0a is newline in both modes.
4322 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4323 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4324 // char is a line ending. Exit the scanning loop.
4325 break;
4326 }
4327 }
4328 ix = UTEXT_GETNATIVEINDEX(fInputText);
4329 }
4330 }
4331
4332 // If there were no matching characters, skip over the loop altogether.
4333 // The loop doesn't run at all, a * op always succeeds.
4334 if (ix == fp->fInputIdx) {
4335 fp->fPatIdx++; // skip the URX_LOOP_C op.
4336 break;
4337 }
4338
4339 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4340 // must follow. It's operand is the stack location
4341 // that holds the starting input index for the match of this .*
4342 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4343 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4344 int32_t stackLoc = URX_VAL(loopcOp);
4345 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4346 fp->fExtra[stackLoc] = fp->fInputIdx;
4347 #ifdef REGEX_SMART_BACKTRACKING
4348 backSearchIndex = fp->fInputIdx;
4349 #endif
4350 fp->fInputIdx = ix;
4351
4352 // Save State to the URX_LOOP_C op that follows this one,
4353 // so that match failures in the following code will return to there.
4354 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4355 fp = StateSave(fp, fp->fPatIdx, status);
4356 fp->fPatIdx++;
4357 }
4358 break;
4359
4360
4361 case URX_LOOP_C:
4362 {
4363 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4364 backSearchIndex = fp->fExtra[opValue];
4365 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4366 if (backSearchIndex == fp->fInputIdx) {
4367 // We've backed up the input idx to the point that the loop started.
4368 // The loop is done. Leave here without saving state.
4369 // Subsequent failures won't come back here.
4370 break;
4371 }
4372 // Set up for the next iteration of the loop, with input index
4373 // backed up by one from the last time through,
4374 // and a state save to this instruction in case the following code fails again.
4375 // (We're going backwards because this loop emulates stack unwinding, not
4376 // the initial scan forward.)
4377 U_ASSERT(fp->fInputIdx > 0);
4378 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4379 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4380 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4381
4382 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4383 if (prevC == 0x0a &&
4384 fp->fInputIdx > backSearchIndex &&
4385 twoPrevC == 0x0d) {
4386 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4387 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4388 // .*, stepping back over CRLF pair.
4389 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4390 }
4391 }
4392
4393
4394 fp = StateSave(fp, fp->fPatIdx-1, status);
4395 }
4396 break;
4397
4398
4399
4400 default:
4401 // Trouble. The compiled pattern contains an entry with an
4402 // unrecognized type tag.
4403 U_ASSERT(FALSE);
4404 }
4405
4406 if (U_FAILURE(status)) {
4407 isMatch = FALSE;
4408 break;
4409 }
4410 }
4411
4412 breakFromLoop:
4413 fMatch = isMatch;
4414 if (isMatch) {
4415 fLastMatchEnd = fMatchEnd;
4416 fMatchStart = startIdx;
4417 fMatchEnd = fp->fInputIdx;
4418 if (fTraceDebug) {
4419 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
4420 }
4421 }
4422 else
4423 {
4424 if (fTraceDebug) {
4425 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4426 }
4427 }
4428
4429 fFrame = fp; // The active stack frame when the engine stopped.
4430 // Contains the capture group results that we need to
4431 // access later.
4432 return;
4433 }
4434
4435
4436 //--------------------------------------------------------------------------------
4437 //
4438 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4439 // assumption that the entire string is available in the UText's
4440 // chunk buffer. For now, that means we can use int32_t indexes,
4441 // except for anything that needs to be saved (like group starts
4442 // and ends).
4443 //
4444 // startIdx: begin matching a this index.
4445 // toEnd: if true, match must extend to end of the input region
4446 //
4447 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4448 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4449 UBool isMatch = FALSE; // True if the we have a match.
4450
4451 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4452
4453 int32_t op; // Operation from the compiled pattern, split into
4454 int32_t opType; // the opcode
4455 int32_t opValue; // and the operand value.
4456
4457 #ifdef REGEX_RUN_DEBUG
4458 if (fTraceDebug)
4459 {
4460 printf("MatchAt(startIdx=%ld)\n", startIdx);
4461 printf("Original Pattern: ");
4462 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4463 while (c != U_SENTINEL) {
4464 if (c<32 || c>256) {
4465 c = '.';
4466 }
4467 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4468
4469 c = UTEXT_NEXT32(fPattern->fPattern);
4470 }
4471 printf("\n");
4472 printf("Input String: ");
4473 c = utext_next32From(fInputText, 0);
4474 while (c != U_SENTINEL) {
4475 if (c<32 || c>256) {
4476 c = '.';
4477 }
4478 printf("%c", c);
4479
4480 c = UTEXT_NEXT32(fInputText);
4481 }
4482 printf("\n");
4483 printf("\n");
4484 }
4485 #endif
4486
4487 if (U_FAILURE(status)) {
4488 return;
4489 }
4490
4491 // Cache frequently referenced items from the compiled pattern
4492 //
4493 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4494
4495 const UChar *litText = fPattern->fLiteralText.getBuffer();
4496 UVector *sets = fPattern->fSets;
4497
4498 const UChar *inputBuf = fInputText->chunkContents;
4499
4500 fFrameSize = fPattern->fFrameSize;
4501 REStackFrame *fp = resetStack();
4502
4503 fp->fPatIdx = 0;
4504 fp->fInputIdx = startIdx;
4505
4506 // Zero out the pattern's static data
4507 int32_t i;
4508 for (i = 0; i<fPattern->fDataSize; i++) {
4509 fData[i] = 0;
4510 }
4511
4512 //
4513 // Main loop for interpreting the compiled pattern.
4514 // One iteration of the loop per pattern operation performed.
4515 //
4516 for (;;) {
4517 #if 0
4518 if (_heapchk() != _HEAPOK) {
4519 fprintf(stderr, "Heap Trouble\n");
4520 }
4521 #endif
4522
4523 op = (int32_t)pat[fp->fPatIdx];
4524 opType = URX_TYPE(op);
4525 opValue = URX_VAL(op);
4526 #ifdef REGEX_RUN_DEBUG
4527 if (fTraceDebug) {
4528 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4529 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
4530 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4531 fPattern->dumpOp(fp->fPatIdx);
4532 }
4533 #endif
4534 fp->fPatIdx++;
4535
4536 switch (opType) {
4537
4538
4539 case URX_NOP:
4540 break;
4541
4542
4543 case URX_BACKTRACK:
4544 // Force a backtrack. In some circumstances, the pattern compiler
4545 // will notice that the pattern can't possibly match anything, and will
4546 // emit one of these at that point.
4547 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4548 break;
4549
4550
4551 case URX_ONECHAR:
4552 if (fp->fInputIdx < fActiveLimit) {
4553 UChar32 c;
4554 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4555 if (c == opValue) {
4556 break;
4557 }
4558 } else {
4559 fHitEnd = TRUE;
4560 }
4561
4562 #ifdef REGEX_SMART_BACKTRACKING
4563 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
4564 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4565 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4566 int64_t reverseIndex = fp->fInputIdx;
4567 UChar32 c;
4568 do {
4569 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4570 if (c == opValue) {
4571 break;
4572 }
4573 } while (reverseIndex > backSearchIndex);
4574 if (c == opValue) {
4575 fHitEnd = FALSE;
4576 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4577 fp->fInputIdx = reverseIndex;
4578 if (fp->fInputIdx > backSearchIndex) {
4579 fp = StateSave(fp, fp->fPatIdx, status);
4580 }
4581 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4582 break;
4583 }
4584 }
4585 }
4586 #endif
4587
4588 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4589 break;
4590
4591
4592 case URX_STRING:
4593 {
4594 // Test input against a literal string.
4595 // Strings require two slots in the compiled pattern, one for the
4596 // offset to the string text, and one for the length.
4597 int32_t stringStartIdx = opValue;
4598 int32_t stringLen;
4599
4600 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4601 fp->fPatIdx++;
4602 opType = URX_TYPE(op);
4603 stringLen = URX_VAL(op);
4604 U_ASSERT(opType == URX_STRING_LEN);
4605 U_ASSERT(stringLen >= 2);
4606
4607 if (fp->fInputIdx + stringLen > fActiveLimit) {
4608 // No match. String is longer than the remaining input text.
4609 fHitEnd = TRUE; // TODO: See ticket 6074
4610 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4611 break;
4612 }
4613
4614 const UChar * pInp = inputBuf + fp->fInputIdx;
4615 const UChar * pPat = litText+stringStartIdx;
4616 const UChar * pEnd = pInp + stringLen;
4617 UBool success = FALSE;
4618 for(;;) {
4619 if (*pInp == *pPat) {
4620 pInp++;
4621 pPat++;
4622 if (pInp == pEnd) {
4623 // Successful Match.
4624 success = TRUE;
4625 break;
4626 }
4627 } else {
4628 // Match failed.
4629 break;
4630 }
4631 }
4632
4633 if (success) {
4634 fp->fInputIdx += stringLen;
4635 } else {
4636 #ifdef REGEX_SMART_BACKTRACKING
4637 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4638 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4639 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4640 // Reset to last start point
4641 int64_t reverseIndex = fp->fInputIdx;
4642 UChar32 c;
4643 pPat = litText+stringStartIdx;
4644
4645 // Search backwards for a possible start
4646 do {
4647 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4648 if ((U_IS_BMP(c) && *pPat == c) ||
4649 (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
4650 success = TRUE;
4651 break;
4652 }
4653 } while (reverseIndex > backSearchIndex);
4654
4655 // And try again
4656 if (success) {
4657 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4658 fp->fInputIdx = reverseIndex;
4659 if (fp->fInputIdx > backSearchIndex) {
4660 fp = StateSave(fp, fp->fPatIdx, status);
4661 }
4662 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4663 break;
4664 }
4665 }
4666 }
4667 #endif
4668 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4669 }
4670 }
4671 break;
4672
4673
4674 case URX_STATE_SAVE:
4675 fp = StateSave(fp, opValue, status);
4676 break;
4677
4678
4679 case URX_END:
4680 // The match loop will exit via this path on a successful match,
4681 // when we reach the end of the pattern.
4682 if (toEnd && fp->fInputIdx != fActiveLimit) {
4683 // The pattern matched, but not to the end of input. Try some more.
4684 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4685 break;
4686 }
4687 isMatch = TRUE;
4688 goto breakFromLoop;
4689
4690 // Start and End Capture stack frame variables are laid out out like this:
4691 // fp->fExtra[opValue] - The start of a completed capture group
4692 // opValue+1 - The end of a completed capture group
4693 // opValue+2 - the start of a capture group whose end
4694 // has not yet been reached (and might not ever be).
4695 case URX_START_CAPTURE:
4696 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4697 fp->fExtra[opValue+2] = fp->fInputIdx;
4698 break;
4699
4700
4701 case URX_END_CAPTURE:
4702 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4703 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4704 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4705 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4706 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4707 break;
4708
4709
4710 case URX_DOLLAR: // $, test for End of line
4711 // or for position before new line at end of input
4712 if (fp->fInputIdx < fAnchorLimit-2) {
4713 // We are no where near the end of input. Fail.
4714 // This is the common case. Keep it first.
4715 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716 break;
4717 }
4718 if (fp->fInputIdx >= fAnchorLimit) {
4719 // We really are at the end of input. Success.
4720 fHitEnd = TRUE;
4721 fRequireEnd = TRUE;
4722 break;
4723 }
4724
4725 // If we are positioned just before a new-line that is located at the
4726 // end of input, succeed.
4727 if (fp->fInputIdx == fAnchorLimit-1) {
4728 UChar32 c;
4729 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4730
4731 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4732 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4733 // At new-line at end of input. Success
4734 fHitEnd = TRUE;
4735 fRequireEnd = TRUE;
4736 break;
4737 }
4738 }
4739 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4740 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4741 fHitEnd = TRUE;
4742 fRequireEnd = TRUE;
4743 break; // At CR/LF at end of input. Success
4744 }
4745
4746 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4747
4748 break;
4749
4750
4751 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4752 if (fp->fInputIdx >= fAnchorLimit-1) {
4753 // Either at the last character of input, or off the end.
4754 if (fp->fInputIdx == fAnchorLimit-1) {
4755 // At last char of input. Success if it's a new line.
4756 if (inputBuf[fp->fInputIdx] == 0x0a) {
4757 fHitEnd = TRUE;
4758 fRequireEnd = TRUE;
4759 break;
4760 }
4761 } else {
4762 // Off the end of input. Success.
4763 fHitEnd = TRUE;
4764 fRequireEnd = TRUE;
4765 break;
4766 }
4767 }
4768
4769 // Not at end of input. Back-track out.
4770 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4771 break;
4772
4773
4774 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4775 {
4776 if (fp->fInputIdx >= fAnchorLimit) {
4777 // We really are at the end of input. Success.
4778 fHitEnd = TRUE;
4779 fRequireEnd = TRUE;
4780 break;
4781 }
4782 // If we are positioned just before a new-line, succeed.
4783 // It makes no difference where the new-line is within the input.
4784 UChar32 c = inputBuf[fp->fInputIdx];
4785 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4786 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4787 // In multi-line mode, hitting a new-line just before the end of input does not
4788 // set the hitEnd or requireEnd flags
4789 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4790 break;
4791 }
4792 }
4793 // not at a new line. Fail.
4794 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4795 }
4796 break;
4797
4798
4799 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4800 {
4801 if (fp->fInputIdx >= fAnchorLimit) {
4802 // We really are at the end of input. Success.
4803 fHitEnd = TRUE;
4804 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4805 break; // adding a new-line would not lose the match.
4806 }
4807 // If we are not positioned just before a new-line, the test fails; backtrack out.
4808 // It makes no difference where the new-line is within the input.
4809 if (inputBuf[fp->fInputIdx] != 0x0a) {
4810 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4811 }
4812 }
4813 break;
4814
4815
4816 case URX_CARET: // ^, test for start of line
4817 if (fp->fInputIdx != fAnchorStart) {
4818 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4819 }
4820 break;
4821
4822
4823 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4824 {
4825 if (fp->fInputIdx == fAnchorStart) {
4826 // We are at the start input. Success.
4827 break;
4828 }
4829 // Check whether character just before the current pos is a new-line
4830 // unless we are at the end of input
4831 UChar c = inputBuf[fp->fInputIdx - 1];
4832 if ((fp->fInputIdx < fAnchorLimit) &&
4833 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4834 // It's a new-line. ^ is true. Success.
4835 // TODO: what should be done with positions between a CR and LF?
4836 break;
4837 }
4838 // Not at the start of a line. Fail.
4839 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4840 }
4841 break;
4842
4843
4844 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4845 {
4846 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4847 if (fp->fInputIdx <= fAnchorStart) {
4848 // We are at the start input. Success.
4849 break;
4850 }
4851 // Check whether character just before the current pos is a new-line
4852 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4853 UChar c = inputBuf[fp->fInputIdx - 1];
4854 if (c != 0x0a) {
4855 // Not at the start of a line. Back-track out.
4856 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4857 }
4858 }
4859 break;
4860
4861 case URX_BACKSLASH_B: // Test for word boundaries
4862 {
4863 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4864 success ^= (opValue != 0); // flip sense for \B
4865 if (!success) {
4866 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4867 }
4868 }
4869 break;
4870
4871
4872 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4873 {
4874 UBool success = isUWordBoundary(fp->fInputIdx);
4875 success ^= (opValue != 0); // flip sense for \B
4876 if (!success) {
4877 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4878 }
4879 }
4880 break;
4881
4882
4883 case URX_BACKSLASH_D: // Test for decimal digit
4884 {
4885 if (fp->fInputIdx >= fActiveLimit) {
4886 fHitEnd = TRUE;
4887 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888 break;
4889 }
4890
4891 UChar32 c;
4892 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4893 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4894 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4895 success ^= (opValue != 0); // flip sense for \D
4896 if (!success) {
4897 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4898 }
4899 }
4900 break;
4901
4902
4903 case URX_BACKSLASH_G: // Test for position at end of previous match
4904 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4905 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4906 }
4907 break;
4908
4909
4910 case URX_BACKSLASH_X:
4911 // Match a Grapheme, as defined by Unicode TR 29.
4912 // Differs slightly from Perl, which consumes combining marks independently
4913 // of context.
4914 {
4915
4916 // Fail if at end of input
4917 if (fp->fInputIdx >= fActiveLimit) {
4918 fHitEnd = TRUE;
4919 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4920 break;
4921 }
4922
4923 // Examine (and consume) the current char.
4924 // Dispatch into a little state machine, based on the char.
4925 UChar32 c;
4926 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4927 UnicodeSet **sets = fPattern->fStaticSets;
4928 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4929 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4930 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4931 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4932 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4933 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4934 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4935 goto GC_Extend;
4936
4937
4938
4939 GC_L:
4940 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4941 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4942 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4943 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4944 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4945 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4946 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4947 goto GC_Extend;
4948
4949 GC_V:
4950 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4951 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4952 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4953 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4954 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4955 goto GC_Extend;
4956
4957 GC_T:
4958 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4959 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4960 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4961 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4962 goto GC_Extend;
4963
4964 GC_Extend:
4965 // Combining characters are consumed here
4966 for (;;) {
4967 if (fp->fInputIdx >= fActiveLimit) {
4968 break;
4969 }
4970 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4971 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4972 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4973 break;
4974 }
4975 }
4976 goto GC_Done;
4977
4978 GC_Control:
4979 // Most control chars stand alone (don't combine with combining chars),
4980 // except for that CR/LF sequence is a single grapheme cluster.
4981 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4982 fp->fInputIdx++;
4983 }
4984
4985 GC_Done:
4986 if (fp->fInputIdx >= fActiveLimit) {
4987 fHitEnd = TRUE;
4988 }
4989 break;
4990 }
4991
4992
4993
4994
4995 case URX_BACKSLASH_Z: // Test for end of Input
4996 if (fp->fInputIdx < fAnchorLimit) {
4997 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4998 } else {
4999 fHitEnd = TRUE;
5000 fRequireEnd = TRUE;
5001 }
5002 break;
5003
5004
5005
5006 case URX_STATIC_SETREF:
5007 {
5008 // Test input character against one of the predefined sets
5009 // (Word Characters, for example)
5010 // The high bit of the op value is a flag for the match polarity.
5011 // 0: success if input char is in set.
5012 // 1: success if input char is not in set.
5013 if (fp->fInputIdx >= fActiveLimit) {
5014 fHitEnd = TRUE;
5015 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5016 break;
5017 }
5018
5019 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
5020 opValue &= ~URX_NEG_SET;
5021 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5022
5023 UChar32 c;
5024 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5025 if (c < 256) {
5026 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5027 if (s8->contains(c)) {
5028 success = !success;
5029 }
5030 } else {
5031 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5032 if (s->contains(c)) {
5033 success = !success;
5034 }
5035 }
5036 if (!success) {
5037 #ifdef REGEX_SMART_BACKTRACKING
5038 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5039 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5040 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5041 // Try to find it, backwards
5042 int64_t reverseIndex = fp->fInputIdx;
5043 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5044 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
5045 do {
5046 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5047 if (c < 256) {
5048 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5049 if (s8->contains(c)) {
5050 success = !success;
5051 }
5052 } else {
5053 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5054 if (s->contains(c)) {
5055 success = !success;
5056 }
5057 }
5058 } while (reverseIndex > backSearchIndex && !success);
5059
5060 if (success) {
5061 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5062 fp->fInputIdx = reverseIndex;
5063 if (fp->fInputIdx > backSearchIndex) {
5064 fp = StateSave(fp, fp->fPatIdx, status);
5065 }
5066 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5067 break;
5068 }
5069 }
5070 }
5071 #endif
5072 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5073 }
5074 }
5075 break;
5076
5077
5078 case URX_STAT_SETREF_N:
5079 {
5080 // Test input character for NOT being a member of one of
5081 // the predefined sets (Word Characters, for example)
5082 if (fp->fInputIdx >= fActiveLimit) {
5083 fHitEnd = TRUE;
5084 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5085 break;
5086 }
5087
5088 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5089
5090 UChar32 c;
5091 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5092 if (c < 256) {
5093 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5094 if (s8->contains(c) == FALSE) {
5095 break;
5096 }
5097 } else {
5098 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5099 if (s->contains(c) == FALSE) {
5100 break;
5101 }
5102 }
5103
5104 #ifdef REGEX_SMART_BACKTRACKING
5105 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5106 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5107 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5108 // Try to find it, backwards
5109 int64_t reverseIndex = fp->fInputIdx;
5110 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5111 UBool success = FALSE;
5112 do {
5113 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5114 if (c < 256) {
5115 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5116 if (s8->contains(c) == FALSE) {
5117 success = TRUE;
5118 break;
5119 }
5120 } else {
5121 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5122 if (s->contains(c) == FALSE) {
5123 success = TRUE;
5124 break;
5125 }
5126 }
5127 } while (reverseIndex > backSearchIndex);
5128
5129 if (success) {
5130 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5131 fp->fInputIdx = reverseIndex;
5132 if (fp->fInputIdx > backSearchIndex) {
5133 fp = StateSave(fp, fp->fPatIdx, status);
5134 }
5135 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5136 break;
5137 }
5138 }
5139 }
5140 #endif
5141 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5142 }
5143 break;
5144
5145
5146 case URX_SETREF:
5147 {
5148 if (fp->fInputIdx >= fActiveLimit) {
5149 fHitEnd = TRUE;
5150 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5151 break;
5152 }
5153
5154 U_ASSERT(opValue > 0 && opValue < sets->size());
5155
5156 // There is input left. Pick up one char and test it for set membership.
5157 UChar32 c;
5158 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5159 if (c<256) {
5160 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5161 if (s8->contains(c)) {
5162 // The character is in the set. A Match.
5163 break;
5164 }
5165 } else {
5166 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5167 if (s->contains(c)) {
5168 // The character is in the set. A Match.
5169 break;
5170 }
5171 }
5172
5173 // the character wasn't in the set.
5174 #ifdef REGEX_SMART_BACKTRACKING
5175 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5176 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5177 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5178 // Try to find it, backwards
5179 int64_t reverseIndex = fp->fInputIdx;
5180 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5181 UBool success = FALSE;
5182 do {
5183 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5184 if (c < 256) {
5185 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5186 if (s8->contains(c)) {
5187 success = TRUE;
5188 break;
5189 }
5190 } else {
5191 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5192 if (s->contains(c)) {
5193 success = TRUE;
5194 break;
5195 }
5196 }
5197 } while (reverseIndex > backSearchIndex);
5198
5199 if (success) {
5200 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5201 fp->fInputIdx = reverseIndex;
5202 if (fp->fInputIdx > reverseIndex) {
5203 fp = StateSave(fp, fp->fPatIdx, status);
5204 }
5205 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5206 break;
5207 }
5208 }
5209 }
5210 #endif
5211 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5212 }
5213 break;
5214
5215
5216 case URX_DOTANY:
5217 {
5218 // . matches anything, but stops at end-of-line.
5219 if (fp->fInputIdx >= fActiveLimit) {
5220 // At end of input. Match failed. Backtrack out.
5221 fHitEnd = TRUE;
5222 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5223 break;
5224 }
5225
5226 // There is input left. Advance over one char, unless we've hit end-of-line
5227 UChar32 c;
5228 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5229 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
5230 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
5231 // End of line in normal mode. . does not match.
5232 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5233 break;
5234 }
5235 }
5236 break;
5237
5238
5239 case URX_DOTANY_ALL:
5240 {
5241 // . in dot-matches-all (including new lines) mode
5242 if (fp->fInputIdx >= fActiveLimit) {
5243 // At end of input. Match failed. Backtrack out.
5244 fHitEnd = TRUE;
5245 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5246 break;
5247 }
5248
5249 // There is input left. Advance over one char, except if we are
5250 // at a cr/lf, advance over both of them.
5251 UChar32 c;
5252 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5253 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5254 // In the case of a CR/LF, we need to advance over both.
5255 if (inputBuf[fp->fInputIdx] == 0x0a) {
5256 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5257 }
5258 }
5259 }
5260 break;
5261
5262
5263 case URX_DOTANY_UNIX:
5264 {
5265 // '.' operator, matches all, but stops at end-of-line.
5266 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5267 if (fp->fInputIdx >= fActiveLimit) {
5268 // At end of input. Match failed. Backtrack out.
5269 fHitEnd = TRUE;
5270 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5271 break;
5272 }
5273
5274 // There is input left. Advance over one char, unless we've hit end-of-line
5275 UChar32 c;
5276 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5277 if (c == 0x0a) {
5278 // End of line in normal mode. '.' does not match the \n
5279 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5280 }
5281 }
5282 break;
5283
5284
5285 case URX_JMP:
5286 fp->fPatIdx = opValue;
5287 break;
5288
5289 case URX_FAIL:
5290 isMatch = FALSE;
5291 goto breakFromLoop;
5292
5293 case URX_JMP_SAV:
5294 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5295 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5296 fp->fPatIdx = opValue; // Then JMP.
5297 break;
5298
5299 case URX_JMP_SAV_X:
5300 // This opcode is used with (x)+, when x can match a zero length string.
5301 // Same as JMP_SAV, except conditional on the match having made forward progress.
5302 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5303 // data address of the input position at the start of the loop.
5304 {
5305 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5306 int32_t stoOp = (int32_t)pat[opValue-1];
5307 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5308 int32_t frameLoc = URX_VAL(stoOp);
5309 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5310 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5311 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5312 if (prevInputIdx < fp->fInputIdx) {
5313 // The match did make progress. Repeat the loop.
5314 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5315 fp->fPatIdx = opValue;
5316 fp->fExtra[frameLoc] = fp->fInputIdx;
5317 }
5318 // If the input position did not advance, we do nothing here,
5319 // execution will fall out of the loop.
5320 }
5321 break;
5322
5323 case URX_CTR_INIT:
5324 {
5325 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5326 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5327
5328 // Pick up the three extra operands that CTR_INIT has, and
5329 // skip the pattern location counter past
5330 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5331 fp->fPatIdx += 3;
5332 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5333 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5334 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5335 U_ASSERT(minCount>=0);
5336 U_ASSERT(maxCount>=minCount || maxCount==-1);
5337 U_ASSERT(loopLoc>fp->fPatIdx);
5338
5339 if (minCount == 0) {
5340 fp = StateSave(fp, loopLoc+1, status);
5341 }
5342 if (maxCount == 0) {
5343 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5344 }
5345 }
5346 break;
5347
5348 case URX_CTR_LOOP:
5349 {
5350 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5351 int32_t initOp = (int32_t)pat[opValue];
5352 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5353 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5354 int32_t minCount = (int32_t)pat[opValue+2];
5355 int32_t maxCount = (int32_t)pat[opValue+3];
5356 // Increment the counter. Note: we DIDN'T worry about counter
5357 // overflow, since the data comes from UnicodeStrings, which
5358 // stores its length in an int32_t. Do we have to think about
5359 // this now that we're using UText? Probably not, since the length
5360 // in UChar32s is still an int32_t.
5361 (*pCounter)++;
5362 U_ASSERT(*pCounter > 0);
5363 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5364 U_ASSERT(*pCounter == maxCount || maxCount == -1);
5365 break;
5366 }
5367 if (*pCounter >= minCount) {
5368 fp = StateSave(fp, fp->fPatIdx, status);
5369 }
5370 fp->fPatIdx = opValue + 4; // Loop back.
5371 }
5372 break;
5373
5374 case URX_CTR_INIT_NG:
5375 {
5376 // Initialize a non-greedy loop
5377 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5378 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5379
5380 // Pick up the three extra operands that CTR_INIT has, and
5381 // skip the pattern location counter past
5382 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5383 fp->fPatIdx += 3;
5384 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5385 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5386 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5387 U_ASSERT(minCount>=0);
5388 U_ASSERT(maxCount>=minCount || maxCount==-1);
5389 U_ASSERT(loopLoc>fp->fPatIdx);
5390
5391 if (minCount == 0) {
5392 if (maxCount != 0) {
5393 fp = StateSave(fp, fp->fPatIdx, status);
5394 }
5395 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5396 }
5397 }
5398 break;
5399
5400 case URX_CTR_LOOP_NG:
5401 {
5402 // Non-greedy {min, max} loops
5403 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5404 int32_t initOp = (int32_t)pat[opValue];
5405 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5406 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5407 int32_t minCount = (int32_t)pat[opValue+2];
5408 int32_t maxCount = (int32_t)pat[opValue+3];
5409 // Increment the counter. Note: we DIDN'T worry about counter
5410 // overflow, since the data comes from UnicodeStrings, which
5411 // stores its length in an int32_t. Do we have to think about
5412 // this now that we're using UText? Probably not, since the length
5413 // in UChar32s is still an int32_t.
5414 (*pCounter)++;
5415 U_ASSERT(*pCounter > 0);
5416
5417 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5418 // The loop has matched the maximum permitted number of times.
5419 // Break out of here with no action. Matching will
5420 // continue with the following pattern.
5421 U_ASSERT(*pCounter == maxCount || maxCount == -1);
5422 break;
5423 }
5424
5425 if (*pCounter < minCount) {
5426 // We haven't met the minimum number of matches yet.
5427 // Loop back for another one.
5428 fp->fPatIdx = opValue + 4; // Loop back.
5429 } else {
5430 // We do have the minimum number of matches.
5431 // Fall into the following pattern, but first do
5432 // a state save to the top of the loop, so that a failure
5433 // in the following pattern will try another iteration of the loop.
5434 fp = StateSave(fp, opValue + 4, status);
5435 }
5436 }
5437 break;
5438
5439 case URX_STO_SP:
5440 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5441 fData[opValue] = fStack->size();
5442 break;
5443
5444 case URX_LD_SP:
5445 {
5446 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5447 int32_t newStackSize = (int32_t)fData[opValue];
5448 U_ASSERT(newStackSize <= fStack->size());
5449 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5450 if (newFP == (int64_t *)fp) {
5451 break;
5452 }
5453 int32_t i;
5454 for (i=0; i<fFrameSize; i++) {
5455 newFP[i] = ((int64_t *)fp)[i];
5456 }
5457 fp = (REStackFrame *)newFP;
5458 fStack->setSize(newStackSize);
5459 }
5460 break;
5461
5462 case URX_BACKREF:
5463 case URX_BACKREF_I:
5464 {
5465 U_ASSERT(opValue < fFrameSize);
5466 int64_t groupStartIdx = fp->fExtra[opValue];
5467 int64_t groupEndIdx = fp->fExtra[opValue+1];
5468 U_ASSERT(groupStartIdx <= groupEndIdx);
5469 int64_t len = groupEndIdx-groupStartIdx;
5470 if (groupStartIdx < 0) {
5471 // This capture group has not participated in the match thus far,
5472 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5473 }
5474
5475 if (len == 0) {
5476 // The capture group match was of an empty string.
5477 // Verified by testing: Perl matches succeed in this case, so
5478 // we do too.
5479 break;
5480 }
5481
5482 UBool haveMatch = FALSE;
5483 if (fp->fInputIdx + len <= fActiveLimit) {
5484 if (opType == URX_BACKREF) {
5485 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
5486 haveMatch = TRUE;
5487 }
5488 } else {
5489 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
5490 (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
5491 haveMatch = TRUE;
5492 }
5493 }
5494 } else {
5495 // TODO: probably need to do a partial string comparison, and only
5496 // set HitEnd if the available input matched. Ticket #6074
5497 fHitEnd = TRUE;
5498 }
5499 if (haveMatch) {
5500 fp->fInputIdx += len; // Match. Advance current input position.
5501 } else {
5502 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5503 }
5504 }
5505 break;
5506
5507 case URX_STO_INP_LOC:
5508 {
5509 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5510 fp->fExtra[opValue] = fp->fInputIdx;
5511 }
5512 break;
5513
5514 case URX_JMPX:
5515 {
5516 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5517 fp->fPatIdx += 1;
5518 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5519 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5520 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5521 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5522 if (savedInputIdx < fp->fInputIdx) {
5523 fp->fPatIdx = opValue; // JMP
5524 } else {
5525 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5526 }
5527 }
5528 break;
5529
5530 case URX_LA_START:
5531 {
5532 // Entering a lookahead block.
5533 // Save Stack Ptr, Input Pos.
5534 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5535 fData[opValue] = fStack->size();
5536 fData[opValue+1] = fp->fInputIdx;
5537 fActiveStart = fLookStart; // Set the match region change for
5538 fActiveLimit = fLookLimit; // transparent bounds.
5539 }
5540 break;
5541
5542 case URX_LA_END:
5543 {
5544 // Leaving a look-ahead block.
5545 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5546 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5547 int32_t stackSize = fStack->size();
5548 int32_t newStackSize = (int32_t)fData[opValue];
5549 U_ASSERT(stackSize >= newStackSize);
5550 if (stackSize > newStackSize) {
5551 // Copy the current top frame back to the new (cut back) top frame.
5552 // This makes the capture groups from within the look-ahead
5553 // expression available.
5554 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5555 int32_t i;
5556 for (i=0; i<fFrameSize; i++) {
5557 newFP[i] = ((int64_t *)fp)[i];
5558 }
5559 fp = (REStackFrame *)newFP;
5560 fStack->setSize(newStackSize);
5561 }
5562 fp->fInputIdx = fData[opValue+1];
5563
5564 // Restore the active region bounds in the input string; they may have
5565 // been changed because of transparent bounds on a Region.
5566 fActiveStart = fRegionStart;
5567 fActiveLimit = fRegionLimit;
5568 }
5569 break;
5570
5571 case URX_ONECHAR_I:
5572 if (fp->fInputIdx < fActiveLimit) {
5573 UChar32 c;
5574 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5575 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5576 break;
5577 }
5578 } else {
5579 fHitEnd = TRUE;
5580 }
5581
5582 #ifdef REGEX_SMART_BACKTRACKING
5583 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5584 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5585 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5586 UBool success = FALSE;
5587 int64_t reverseIndex = fp->fInputIdx;
5588 UChar32 c;
5589 while (reverseIndex > backSearchIndex) {
5590 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5591 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5592 success = TRUE;
5593 break;
5594 } else if (c == U_SENTINEL) {
5595 break;
5596 }
5597 }
5598 if (success) {
5599 fHitEnd = FALSE;
5600 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5601 fp->fInputIdx = reverseIndex;
5602 if (fp->fInputIdx > backSearchIndex) {
5603 fp = StateSave(fp, fp->fPatIdx, status);
5604 }
5605 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5606 break;
5607 }
5608 }
5609 }
5610 #endif
5611
5612 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5613 break;
5614
5615 case URX_STRING_I:
5616 {
5617 // Test input against a literal string.
5618 // Strings require two slots in the compiled pattern, one for the
5619 // offset to the string text, and one for the length.
5620 const UCaseProps *csp = ucase_getSingleton();
5621 {
5622 int32_t stringStartIdx, stringLen;
5623 stringStartIdx = opValue;
5624
5625 op = (int32_t)pat[fp->fPatIdx];
5626 fp->fPatIdx++;
5627 opType = URX_TYPE(op);
5628 opValue = URX_VAL(op);
5629 U_ASSERT(opType == URX_STRING_LEN);
5630 stringLen = opValue;
5631
5632 const UChar *patternChars = litText+stringStartIdx;
5633 const UChar *patternEnd = patternChars+stringLen;
5634
5635 const UChar *foldChars = NULL;
5636 int32_t foldOffset, foldLength;
5637 UChar32 c;
5638 // BEGIN android-changed
5639 // For ICU ticket#8824
5640 UBool c_is_valid = FALSE;
5641
5642 #ifdef REGEX_SMART_BACKTRACKING
5643 int32_t originalInputIdx = fp->fInputIdx;
5644 #endif
5645 UBool success = TRUE;
5646
5647 foldOffset = foldLength = 0;
5648
5649 while (patternChars < patternEnd && success) {
5650 if (fp->fInputIdx < fActiveLimit) { // don't read past end of string
5651 if(foldOffset < foldLength) {
5652 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5653 c_is_valid = TRUE;
5654 } else {
5655 // test pre-condition of U16_NEXT: i < length
5656 U_ASSERT(fp->fInputIdx < fActiveLimit);
5657 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5658 c_is_valid = TRUE;
5659 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5660 if(foldLength >= 0) {
5661 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
5662 foldOffset = 0;
5663 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5664 } else {
5665 c = foldLength;
5666 foldLength = foldOffset; // to avoid reading chars from the folding buffer
5667 }
5668 }
5669 }
5670 } else {
5671 c_is_valid = FALSE;
5672 }
5673
5674 if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
5675 if (U_IS_BMP(c)) {
5676 success = (*patternChars == c);
5677 patternChars += 1;
5678 } else if (patternChars+1 < patternEnd) {
5679 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
5680 patternChars += 2;
5681 }
5682 } else {
5683 success = FALSE;
5684 fHitEnd = TRUE; // TODO: See ticket 6074
5685 }
5686 }
5687 // END android-changed
5688
5689 if (!success) {
5690 #ifdef REGEX_SMART_BACKTRACKING
5691 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
5692 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5693 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5694 // Reset to last start point
5695 int64_t reverseIndex = originalInputIdx;
5696 patternChars = litText+stringStartIdx;
5697
5698 // Search backwards for a possible start
5699 do {
5700 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5701 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5702 if(foldLength >= 0) {
5703 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
5704 foldOffset = 0;
5705 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5706 } else {
5707 c = foldLength;
5708 foldLength = foldOffset; // to avoid reading chars from the folding buffer
5709 }
5710 }
5711
5712 if ((U_IS_BMP(c) && *patternChars == c) ||
5713 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
5714 success = TRUE;
5715 break;
5716 }
5717 } while (reverseIndex > backSearchIndex);
5718
5719 // And try again
5720 if (success) {
5721 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5722 fp->fInputIdx = reverseIndex;
5723 if (fp->fInputIdx > backSearchIndex) {
5724 fp = StateSave(fp, fp->fPatIdx, status);
5725 }
5726 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5727 break;
5728 }
5729 }
5730 }
5731 #endif
5732 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5733 }
5734 }
5735 }
5736 break;
5737
5738 case URX_LB_START:
5739 {
5740 // Entering a look-behind block.
5741 // Save Stack Ptr, Input Pos.
5742 // TODO: implement transparent bounds. Ticket #6067
5743 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5744 fData[opValue] = fStack->size();
5745 fData[opValue+1] = fp->fInputIdx;
5746 // Init the variable containing the start index for attempted matches.
5747 fData[opValue+2] = -1;
5748 // Save input string length, then reset to pin any matches to end at
5749 // the current position.
5750 fData[opValue+3] = fActiveLimit;
5751 fActiveLimit = fp->fInputIdx;
5752 }
5753 break;
5754
5755
5756 case URX_LB_CONT:
5757 {
5758 // Positive Look-Behind, at top of loop checking for matches of LB expression
5759 // at all possible input starting positions.
5760
5761 // Fetch the min and max possible match lengths. They are the operands
5762 // of this op in the pattern.
5763 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5764 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5765 U_ASSERT(minML <= maxML);
5766 U_ASSERT(minML >= 0);
5767
5768 // Fetch (from data) the last input index where a match was attempted.
5769 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5770 int64_t *lbStartIdx = &fData[opValue+2];
5771 if (*lbStartIdx < 0) {
5772 // First time through loop.
5773 *lbStartIdx = fp->fInputIdx - minML;
5774 } else {
5775 // 2nd through nth time through the loop.
5776 // Back up start position for match by one.
5777 if (*lbStartIdx == 0) {
5778 (*lbStartIdx)--;
5779 } else {
5780 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5781 }
5782 }
5783
5784 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5785 // We have tried all potential match starting points without
5786 // getting a match. Backtrack out, and out of the
5787 // Look Behind altogether.
5788 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5789 int64_t restoreInputLen = fData[opValue+3];
5790 U_ASSERT(restoreInputLen >= fActiveLimit);
5791 U_ASSERT(restoreInputLen <= fInputLength);
5792 fActiveLimit = restoreInputLen;
5793 break;
5794 }
5795
5796 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5797 // (successful match will fall off the end of the loop.)
5798 fp = StateSave(fp, fp->fPatIdx-3, status);
5799 fp->fInputIdx = *lbStartIdx;
5800 }
5801 break;
5802
5803 case URX_LB_END:
5804 // End of a look-behind block, after a successful match.
5805 {
5806 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5807 if (fp->fInputIdx != fActiveLimit) {
5808 // The look-behind expression matched, but the match did not
5809 // extend all the way to the point that we are looking behind from.
5810 // FAIL out of here, which will take us back to the LB_CONT, which
5811 // will retry the match starting at another position or fail
5812 // the look-behind altogether, whichever is appropriate.
5813 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5814 break;
5815 }
5816
5817 // Look-behind match is good. Restore the orignal input string length,
5818 // which had been truncated to pin the end of the lookbehind match to the
5819 // position being looked-behind.
5820 int64_t originalInputLen = fData[opValue+3];
5821 U_ASSERT(originalInputLen >= fActiveLimit);
5822 U_ASSERT(originalInputLen <= fInputLength);
5823 fActiveLimit = originalInputLen;
5824 }
5825 break;
5826
5827
5828 case URX_LBN_CONT:
5829 {
5830 // Negative Look-Behind, at top of loop checking for matches of LB expression
5831 // at all possible input starting positions.
5832
5833 // Fetch the extra parameters of this op.
5834 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5835 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5836 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5837 continueLoc = URX_VAL(continueLoc);
5838 U_ASSERT(minML <= maxML);
5839 U_ASSERT(minML >= 0);
5840 U_ASSERT(continueLoc > fp->fPatIdx);
5841
5842 // Fetch (from data) the last input index where a match was attempted.
5843 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5844 int64_t *lbStartIdx = &fData[opValue+2];
5845 if (*lbStartIdx < 0) {
5846 // First time through loop.
5847 *lbStartIdx = fp->fInputIdx - minML;
5848 } else {
5849 // 2nd through nth time through the loop.
5850 // Back up start position for match by one.
5851 if (*lbStartIdx == 0) {
5852 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
5853 } else {
5854 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5855 }
5856 }
5857
5858 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5859 // We have tried all potential match starting points without
5860 // getting a match, which means that the negative lookbehind as
5861 // a whole has succeeded. Jump forward to the continue location
5862 int64_t restoreInputLen = fData[opValue+3];
5863 U_ASSERT(restoreInputLen >= fActiveLimit);
5864 U_ASSERT(restoreInputLen <= fInputLength);
5865 fActiveLimit = restoreInputLen;
5866 fp->fPatIdx = continueLoc;
5867 break;
5868 }
5869
5870 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5871 // (successful match will cause a FAIL out of the loop altogether.)
5872 fp = StateSave(fp, fp->fPatIdx-4, status);
5873 fp->fInputIdx = *lbStartIdx;
5874 }
5875 break;
5876
5877 case URX_LBN_END:
5878 // End of a negative look-behind block, after a successful match.
5879 {
5880 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5881 if (fp->fInputIdx != fActiveLimit) {
5882 // The look-behind expression matched, but the match did not
5883 // extend all the way to the point that we are looking behind from.
5884 // FAIL out of here, which will take us back to the LB_CONT, which
5885 // will retry the match starting at another position or succeed
5886 // the look-behind altogether, whichever is appropriate.
5887 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5888 break;
5889 }
5890
5891 // Look-behind expression matched, which means look-behind test as
5892 // a whole Fails
5893
5894 // Restore the orignal input string length, which had been truncated
5895 // inorder to pin the end of the lookbehind match
5896 // to the position being looked-behind.
5897 int64_t originalInputLen = fData[opValue+3];
5898 U_ASSERT(originalInputLen >= fActiveLimit);
5899 U_ASSERT(originalInputLen <= fInputLength);
5900 fActiveLimit = originalInputLen;
5901
5902 // Restore original stack position, discarding any state saved
5903 // by the successful pattern match.
5904 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5905 int32_t newStackSize = (int32_t)fData[opValue];
5906 U_ASSERT(fStack->size() > newStackSize);
5907 fStack->setSize(newStackSize);
5908
5909 // FAIL, which will take control back to someplace
5910 // prior to entering the look-behind test.
5911 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5912 }
5913 break;
5914
5915
5916 case URX_LOOP_SR_I:
5917 // Loop Initialization for the optimized implementation of
5918 // [some character set]*
5919 // This op scans through all matching input.
5920 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5921 {
5922 U_ASSERT(opValue > 0 && opValue < sets->size());
5923 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5924 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5925
5926 // Loop through input, until either the input is exhausted or
5927 // we reach a character that is not a member of the set.
5928 int32_t ix = (int32_t)fp->fInputIdx;
5929 for (;;) {
5930 if (ix >= fActiveLimit) {
5931 fHitEnd = TRUE;
5932 break;
5933 }
5934 UChar32 c;
5935 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5936 if (c<256) {
5937 if (s8->contains(c) == FALSE) {
5938 U16_BACK_1(inputBuf, 0, ix);
5939 break;
5940 }
5941 } else {
5942 if (s->contains(c) == FALSE) {
5943 U16_BACK_1(inputBuf, 0, ix);
5944 break;
5945 }
5946 }
5947 }
5948
5949 // If there were no matching characters, skip over the loop altogether.
5950 // The loop doesn't run at all, a * op always succeeds.
5951 if (ix == fp->fInputIdx) {
5952 fp->fPatIdx++; // skip the URX_LOOP_C op.
5953 break;
5954 }
5955
5956 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5957 // must follow. It's operand is the stack location
5958 // that holds the starting input index for the match of this [set]*
5959 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5960 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5961 int32_t stackLoc = URX_VAL(loopcOp);
5962 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5963 fp->fExtra[stackLoc] = fp->fInputIdx;
5964 #ifdef REGEX_SMART_BACKTRACKING
5965 backSearchIndex = fp->fInputIdx;
5966 #endif
5967 fp->fInputIdx = ix;
5968
5969 // Save State to the URX_LOOP_C op that follows this one,
5970 // so that match failures in the following code will return to there.
5971 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5972 fp = StateSave(fp, fp->fPatIdx, status);
5973 fp->fPatIdx++;
5974 }
5975 break;
5976
5977
5978 case URX_LOOP_DOT_I:
5979 // Loop Initialization for the optimized implementation of .*
5980 // This op scans through all remaining input.
5981 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5982 {
5983 // Loop through input until the input is exhausted (we reach an end-of-line)
5984 // In DOTALL mode, we can just go straight to the end of the input.
5985 int32_t ix;
5986 if ((opValue & 1) == 1) {
5987 // Dot-matches-All mode. Jump straight to the end of the string.
5988 ix = (int32_t)fActiveLimit;
5989 fHitEnd = TRUE;
5990 } else {
5991 // NOT DOT ALL mode. Line endings do not match '.'
5992 // Scan forward until a line ending or end of input.
5993 ix = (int32_t)fp->fInputIdx;
5994 for (;;) {
5995 if (ix >= fActiveLimit) {
5996 fHitEnd = TRUE;
5997 break;
5998 }
5999 UChar32 c;
6000 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
6001 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
6002 if ((c == 0x0a) || // 0x0a is newline in both modes.
6003 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
6004 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
6005 // char is a line ending. Put the input pos back to the
6006 // line ending char, and exit the scanning loop.
6007 U16_BACK_1(inputBuf, 0, ix);
6008 break;
6009 }
6010 }
6011 }
6012 }
6013
6014 // If there were no matching characters, skip over the loop altogether.
6015 // The loop doesn't run at all, a * op always succeeds.
6016 if (ix == fp->fInputIdx) {
6017 fp->fPatIdx++; // skip the URX_LOOP_C op.
6018 break;
6019 }
6020
6021 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
6022 // must follow. It's operand is the stack location
6023 // that holds the starting input index for the match of this .*
6024 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
6025 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
6026 int32_t stackLoc = URX_VAL(loopcOp);
6027 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
6028 fp->fExtra[stackLoc] = fp->fInputIdx;
6029 #ifdef REGEX_SMART_BACKTRACKING
6030 backSearchIndex = fp->fInputIdx;
6031 #endif
6032 fp->fInputIdx = ix;
6033
6034 // Save State to the URX_LOOP_C op that follows this one,
6035 // so that match failures in the following code will return to there.
6036 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
6037 fp = StateSave(fp, fp->fPatIdx, status);
6038 fp->fPatIdx++;
6039 }
6040 break;
6041
6042
6043 case URX_LOOP_C:
6044 {
6045 U_ASSERT(opValue>=0 && opValue<fFrameSize);
6046 backSearchIndex = (int32_t)fp->fExtra[opValue];
6047 U_ASSERT(backSearchIndex <= fp->fInputIdx);
6048 if (backSearchIndex == fp->fInputIdx) {
6049 // We've backed up the input idx to the point that the loop started.
6050 // The loop is done. Leave here without saving state.
6051 // Subsequent failures won't come back here.
6052 break;
6053 }
6054 // Set up for the next iteration of the loop, with input index
6055 // backed up by one from the last time through,
6056 // and a state save to this instruction in case the following code fails again.
6057 // (We're going backwards because this loop emulates stack unwinding, not
6058 // the initial scan forward.)
6059 U_ASSERT(fp->fInputIdx > 0);
6060 UChar32 prevC;
6061 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
6062
6063 if (prevC == 0x0a &&
6064 fp->fInputIdx > backSearchIndex &&
6065 inputBuf[fp->fInputIdx-1] == 0x0d) {
6066 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
6067 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
6068 // .*, stepping back over CRLF pair.
6069 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
6070 }
6071 }
6072
6073
6074 fp = StateSave(fp, fp->fPatIdx-1, status);
6075 }
6076 break;
6077
6078
6079
6080 default:
6081 // Trouble. The compiled pattern contains an entry with an
6082 // unrecognized type tag.
6083 U_ASSERT(FALSE);
6084 }
6085
6086 if (U_FAILURE(status)) {
6087 isMatch = FALSE;
6088 break;
6089 }
6090 }
6091
6092 breakFromLoop:
6093 fMatch = isMatch;
6094 if (isMatch) {
6095 fLastMatchEnd = fMatchEnd;
6096 fMatchStart = startIdx;
6097 fMatchEnd = fp->fInputIdx;
6098 if (fTraceDebug) {
6099 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
6100 }
6101 }
6102 else
6103 {
6104 if (fTraceDebug) {
6105 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6106 }
6107 }
6108
6109 fFrame = fp; // The active stack frame when the engine stopped.
6110 // Contains the capture group results that we need to
6111 // access later.
6112
6113 return;
6114 }
6115
6116
6117 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
6118
6119 U_NAMESPACE_END
6120
6121 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
6122
6123