1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2011 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24 #include "uvector.h"
25 #include "uvectr32.h"
26 #include "uvectr64.h"
27 #include "regeximp.h"
28 #include "regexst.h"
29 #include "regextxt.h"
30 #include "ucase.h"
31
32 // #include <malloc.h> // Needed for heapcheck testing
33
34
35 // Find progress callback
36 // ----------------------
37 // Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call.
38 //
39 #define REGEXFINDPROGRESS_INTERRUPT(pos, status) \
40 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
41
42
43 // Smart Backtracking
44 // ------------------
45 // When a failure would go back to a LOOP_C instruction,
46 // strings, characters, and setrefs scan backwards for a valid start
47 // character themselves, pop the stack, and save state, emulating the
48 // LOOP_C's effect but assured that the next character of input is a
49 // possible matching character.
50 //
51 // Good idea in theory; unfortunately it only helps out a few specific
52 // cases and slows the engine down a little in the rest.
53
54 //#define REGEX_SMART_BACKTRACKING 1
55
56 U_NAMESPACE_BEGIN
57
58 // Default limit for the size of the back track stack, to avoid system
59 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
60 // This value puts ICU's limits higher than most other regexp implementations,
61 // which use recursion rather than the heap, and take more storage per
62 // backtrack point.
63 //
64 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66 // Time limit counter constant.
67 // Time limits for expression evaluation are in terms of quanta of work by
68 // the engine, each of which is 10,000 state saves.
69 // This constant determines that state saves per tick number.
70 static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72 //-----------------------------------------------------------------------------
73 //
74 // Constructor and Destructor
75 //
76 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)77 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
78 fDeferredStatus = U_ZERO_ERROR;
79 init(fDeferredStatus);
80 if (U_FAILURE(fDeferredStatus)) {
81 return;
82 }
83 if (pat==NULL) {
84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85 return;
86 }
87 fPattern = pat;
88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89 }
90
91
92
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)93 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
94 uint32_t flags, UErrorCode &status) {
95 init(status);
96 if (U_FAILURE(status)) {
97 return;
98 }
99 UParseError pe;
100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
101 fPattern = fPatternOwned;
102
103 UText inputText = UTEXT_INITIALIZER;
104 utext_openConstUnicodeString(&inputText, &input, &status);
105 init2(&inputText, status);
106 utext_close(&inputText);
107
108 fInputUniStrMaybeMutable = TRUE;
109 }
110
111
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)112 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113 uint32_t flags, UErrorCode &status) {
114 init(status);
115 if (U_FAILURE(status)) {
116 return;
117 }
118 UParseError pe;
119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
120 if (U_FAILURE(status)) {
121 return;
122 }
123
124 fPattern = fPatternOwned;
125 init2(input, status);
126 }
127
128
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)129 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
130 uint32_t flags, UErrorCode &status) {
131 init(status);
132 if (U_FAILURE(status)) {
133 return;
134 }
135 UParseError pe;
136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
137 if (U_FAILURE(status)) {
138 return;
139 }
140 fPattern = fPatternOwned;
141 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142 }
143
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)144 RegexMatcher::RegexMatcher(UText *regexp,
145 uint32_t flags, UErrorCode &status) {
146 init(status);
147 if (U_FAILURE(status)) {
148 return;
149 }
150 UParseError pe;
151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
152 if (U_FAILURE(status)) {
153 return;
154 }
155
156 fPattern = fPatternOwned;
157 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158 }
159
160
161
162
~RegexMatcher()163 RegexMatcher::~RegexMatcher() {
164 delete fStack;
165 if (fData != fSmallData) {
166 uprv_free(fData);
167 fData = NULL;
168 }
169 if (fPatternOwned) {
170 delete fPatternOwned;
171 fPatternOwned = NULL;
172 fPattern = NULL;
173 }
174
175 if (fInput) {
176 delete fInput;
177 }
178 if (fInputText) {
179 utext_close(fInputText);
180 }
181 if (fAltInputText) {
182 utext_close(fAltInputText);
183 }
184
185 #if UCONFIG_NO_BREAK_ITERATION==0
186 delete fWordBreakItr;
187 #endif
188 }
189
190 //
191 // init() common initialization for use by all constructors.
192 // Initialize all fields, get the object into a consistent state.
193 // This must be done even when the initial status shows an error,
194 // so that the object is initialized sufficiently well for the destructor
195 // to run safely.
196 //
init(UErrorCode & status)197 void RegexMatcher::init(UErrorCode &status) {
198 fPattern = NULL;
199 fPatternOwned = NULL;
200 fFrameSize = 0;
201 fRegionStart = 0;
202 fRegionLimit = 0;
203 fAnchorStart = 0;
204 fAnchorLimit = 0;
205 fLookStart = 0;
206 fLookLimit = 0;
207 fActiveStart = 0;
208 fActiveLimit = 0;
209 fTransparentBounds = FALSE;
210 fAnchoringBounds = TRUE;
211 fMatch = FALSE;
212 fMatchStart = 0;
213 fMatchEnd = 0;
214 fLastMatchEnd = -1;
215 fAppendPosition = 0;
216 fHitEnd = FALSE;
217 fRequireEnd = FALSE;
218 fStack = NULL;
219 fFrame = NULL;
220 fTimeLimit = 0;
221 fTime = 0;
222 fTickCounter = 0;
223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
224 fCallbackFn = NULL;
225 fCallbackContext = NULL;
226 fFindProgressCallbackFn = NULL;
227 fFindProgressCallbackContext = NULL;
228 fTraceDebug = FALSE;
229 fDeferredStatus = status;
230 fData = fSmallData;
231 fWordBreakItr = NULL;
232
233 fStack = new UVector64(status);
234 fInputText = NULL;
235 fAltInputText = NULL;
236 fInput = NULL;
237 fInputLength = 0;
238 fInputUniStrMaybeMutable = FALSE;
239
240 if (U_FAILURE(status)) {
241 fDeferredStatus = status;
242 }
243 }
244
245 //
246 // init2() Common initialization for use by RegexMatcher constructors, part 2.
247 // This handles the common setup to be done after the Pattern is available.
248 //
init2(UText * input,UErrorCode & status)249 void RegexMatcher::init2(UText *input, UErrorCode &status) {
250 if (U_FAILURE(status)) {
251 fDeferredStatus = status;
252 return;
253 }
254
255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257 if (fData == NULL) {
258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259 return;
260 }
261 }
262
263 reset(input);
264 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
265 if (U_FAILURE(status)) {
266 fDeferredStatus = status;
267 return;
268 }
269 }
270
271
272 static const UChar BACKSLASH = 0x5c;
273 static const UChar DOLLARSIGN = 0x24;
274 //--------------------------------------------------------------------------------
275 //
276 // appendReplacement
277 //
278 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)279 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280 const UnicodeString &replacement,
281 UErrorCode &status) {
282 UText replacementText = UTEXT_INITIALIZER;
283
284 utext_openConstUnicodeString(&replacementText, &replacement, &status);
285 if (U_SUCCESS(status)) {
286 UText resultText = UTEXT_INITIALIZER;
287 utext_openUnicodeString(&resultText, &dest, &status);
288
289 if (U_SUCCESS(status)) {
290 appendReplacement(&resultText, &replacementText, status);
291 utext_close(&resultText);
292 }
293 utext_close(&replacementText);
294 }
295
296 return *this;
297 }
298
299 //
300 // appendReplacement, UText mode
301 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)302 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303 UText *replacement,
304 UErrorCode &status) {
305 if (U_FAILURE(status)) {
306 return *this;
307 }
308 if (U_FAILURE(fDeferredStatus)) {
309 status = fDeferredStatus;
310 return *this;
311 }
312 if (fMatch == FALSE) {
313 status = U_REGEX_INVALID_STATE;
314 return *this;
315 }
316
317 // Copy input string from the end of previous match to start of current match
318 int64_t destLen = utext_nativeLength(dest);
319 if (fMatchStart > fAppendPosition) {
320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322 (int32_t)(fMatchStart-fAppendPosition), &status);
323 } else {
324 int32_t len16;
325 if (UTEXT_USES_U16(fInputText)) {
326 len16 = (int32_t)(fMatchStart-fAppendPosition);
327 } else {
328 UErrorCode lengthStatus = U_ZERO_ERROR;
329 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330 }
331 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332 if (inputChars == NULL) {
333 status = U_MEMORY_ALLOCATION_ERROR;
334 return *this;
335 }
336 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338 uprv_free(inputChars);
339 }
340 }
341 fAppendPosition = fMatchEnd;
342
343
344 // scan the replacement text, looking for substitutions ($n) and \escapes.
345 // TODO: optimize this loop by efficiently scanning for '$' or '\',
346 // move entire ranges not containing substitutions.
347 UTEXT_SETNATIVEINDEX(replacement, 0);
348 UChar32 c = UTEXT_NEXT32(replacement);
349 while (c != U_SENTINEL) {
350 if (c == BACKSLASH) {
351 // Backslash Escape. Copy the following char out without further checks.
352 // Note: Surrogate pairs don't need any special handling
353 // The second half wont be a '$' or a '\', and
354 // will move to the dest normally on the next
355 // loop iteration.
356 c = UTEXT_CURRENT32(replacement);
357 if (c == U_SENTINEL) {
358 break;
359 }
360
361 if (c==0x55/*U*/ || c==0x75/*u*/) {
362 // We have a \udddd or \Udddddddd escape sequence.
363 int32_t offset = 0;
364 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
365 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
366 if (escapedChar != (UChar32)0xFFFFFFFF) {
367 if (U_IS_BMP(escapedChar)) {
368 UChar c16 = (UChar)escapedChar;
369 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
370 } else {
371 UChar surrogate[2];
372 surrogate[0] = U16_LEAD(escapedChar);
373 surrogate[1] = U16_TRAIL(escapedChar);
374 if (U_SUCCESS(status)) {
375 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
376 }
377 }
378 // TODO: Report errors for mal-formed \u escapes?
379 // As this is, the original sequence is output, which may be OK.
380 if (context.lastOffset == offset) {
381 (void)UTEXT_PREVIOUS32(replacement);
382 } else if (context.lastOffset != offset-1) {
383 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384 }
385 }
386 } else {
387 (void)UTEXT_NEXT32(replacement);
388 // Plain backslash escape. Just put out the escaped character.
389 if (U_IS_BMP(c)) {
390 UChar c16 = (UChar)c;
391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
392 } else {
393 UChar surrogate[2];
394 surrogate[0] = U16_LEAD(c);
395 surrogate[1] = U16_TRAIL(c);
396 if (U_SUCCESS(status)) {
397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398 }
399 }
400 }
401 } else if (c != DOLLARSIGN) {
402 // Normal char, not a $. Copy it out without further checks.
403 if (U_IS_BMP(c)) {
404 UChar c16 = (UChar)c;
405 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
406 } else {
407 UChar surrogate[2];
408 surrogate[0] = U16_LEAD(c);
409 surrogate[1] = U16_TRAIL(c);
410 if (U_SUCCESS(status)) {
411 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412 }
413 }
414 } else {
415 // We've got a $. Pick up a capture group number if one follows.
416 // Consume at most the number of digits necessary for the largest capture
417 // number that is valid for this pattern.
418
419 int32_t numDigits = 0;
420 int32_t groupNum = 0;
421 UChar32 digitC;
422 for (;;) {
423 digitC = UTEXT_CURRENT32(replacement);
424 if (digitC == U_SENTINEL) {
425 break;
426 }
427 if (u_isdigit(digitC) == FALSE) {
428 break;
429 }
430 (void)UTEXT_NEXT32(replacement);
431 groupNum=groupNum*10 + u_charDigitValue(digitC);
432 numDigits++;
433 if (numDigits >= fPattern->fMaxCaptureDigits) {
434 break;
435 }
436 }
437
438
439 if (numDigits == 0) {
440 // The $ didn't introduce a group number at all.
441 // Treat it as just part of the substitution text.
442 UChar c16 = DOLLARSIGN;
443 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
444 } else {
445 // Finally, append the capture group data to the destination.
446 destLen += appendGroup(groupNum, dest, status);
447 if (U_FAILURE(status)) {
448 // Can fail if group number is out of range.
449 break;
450 }
451 }
452 }
453
454 if (U_FAILURE(status)) {
455 break;
456 } else {
457 c = UTEXT_NEXT32(replacement);
458 }
459 }
460
461 return *this;
462 }
463
464
465
466 //--------------------------------------------------------------------------------
467 //
468 // appendTail Intended to be used in conjunction with appendReplacement()
469 // To the destination string, append everything following
470 // the last match position from the input string.
471 //
472 // Note: Match ranges do not affect appendTail or appendReplacement
473 //
474 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)475 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
476 UErrorCode status = U_ZERO_ERROR;
477 UText resultText = UTEXT_INITIALIZER;
478 utext_openUnicodeString(&resultText, &dest, &status);
479
480 if (U_SUCCESS(status)) {
481 appendTail(&resultText, status);
482 utext_close(&resultText);
483 }
484
485 return dest;
486 }
487
488 //
489 // appendTail, UText mode
490 //
appendTail(UText * dest,UErrorCode & status)491 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
492 UBool bailOut = FALSE;
493 if (U_FAILURE(status)) {
494 bailOut = TRUE;
495 }
496 if (U_FAILURE(fDeferredStatus)) {
497 status = fDeferredStatus;
498 bailOut = TRUE;
499 }
500
501 if (bailOut) {
502 // dest must not be NULL
503 if (dest) {
504 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
505 return dest;
506 }
507 }
508
509 if (fInputLength > fAppendPosition) {
510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
511 int64_t destLen = utext_nativeLength(dest);
512 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
513 (int32_t)(fInputLength-fAppendPosition), &status);
514 } else {
515 int32_t len16;
516 if (UTEXT_USES_U16(fInputText)) {
517 len16 = (int32_t)(fInputLength-fAppendPosition);
518 } else {
519 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
520 status = U_ZERO_ERROR; // buffer overflow
521 }
522
523 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
524 if (inputChars == NULL) {
525 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
526 } else {
527 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
528 int64_t destLen = utext_nativeLength(dest);
529 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
530 uprv_free(inputChars);
531 }
532 }
533 }
534 return dest;
535 }
536
537
538
539 //--------------------------------------------------------------------------------
540 //
541 // end
542 //
543 //--------------------------------------------------------------------------------
end(UErrorCode & err) const544 int32_t RegexMatcher::end(UErrorCode &err) const {
545 return end(0, err);
546 }
547
end64(UErrorCode & err) const548 int64_t RegexMatcher::end64(UErrorCode &err) const {
549 return end64(0, err);
550 }
551
end64(int32_t group,UErrorCode & err) const552 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
553 if (U_FAILURE(err)) {
554 return -1;
555 }
556 if (fMatch == FALSE) {
557 err = U_REGEX_INVALID_STATE;
558 return -1;
559 }
560 if (group < 0 || group > fPattern->fGroupMap->size()) {
561 err = U_INDEX_OUTOFBOUNDS_ERROR;
562 return -1;
563 }
564 int64_t e = -1;
565 if (group == 0) {
566 e = fMatchEnd;
567 } else {
568 // Get the position within the stack frame of the variables for
569 // this capture group.
570 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
571 U_ASSERT(groupOffset < fPattern->fFrameSize);
572 U_ASSERT(groupOffset >= 0);
573 e = fFrame->fExtra[groupOffset + 1];
574 }
575
576 return e;
577 }
578
end(int32_t group,UErrorCode & err) const579 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
580 return (int32_t)end64(group, err);
581 }
582
583
584 //--------------------------------------------------------------------------------
585 //
586 // find()
587 //
588 //--------------------------------------------------------------------------------
find()589 UBool RegexMatcher::find() {
590 // Start at the position of the last match end. (Will be zero if the
591 // matcher has been reset.)
592 //
593 if (U_FAILURE(fDeferredStatus)) {
594 return FALSE;
595 }
596
597 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
598 return findUsingChunk();
599 }
600
601 int64_t startPos = fMatchEnd;
602 if (startPos==0) {
603 startPos = fActiveStart;
604 }
605
606 if (fMatch) {
607 // Save the position of any previous successful match.
608 fLastMatchEnd = fMatchEnd;
609
610 if (fMatchStart == fMatchEnd) {
611 // Previous match had zero length. Move start position up one position
612 // to avoid sending find() into a loop on zero-length matches.
613 if (startPos >= fActiveLimit) {
614 fMatch = FALSE;
615 fHitEnd = TRUE;
616 return FALSE;
617 }
618 UTEXT_SETNATIVEINDEX(fInputText, startPos);
619 (void)UTEXT_NEXT32(fInputText);
620 startPos = UTEXT_GETNATIVEINDEX(fInputText);
621 }
622 } else {
623 if (fLastMatchEnd >= 0) {
624 // A previous find() failed to match. Don't try again.
625 // (without this test, a pattern with a zero-length match
626 // could match again at the end of an input string.)
627 fHitEnd = TRUE;
628 return FALSE;
629 }
630 }
631
632
633 // Compute the position in the input string beyond which a match can not begin, because
634 // the minimum length match would extend past the end of the input.
635 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
636 // Be aware of possible overflows if making changes here.
637 int64_t testStartLimit;
638 if (UTEXT_USES_U16(fInputText)) {
639 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
640 if (startPos > testStartLimit) {
641 fMatch = FALSE;
642 fHitEnd = TRUE;
643 return FALSE;
644 }
645 } else {
646 // For now, let the matcher discover that it can't match on its own
647 // We don't know how long the match len is in native characters
648 testStartLimit = fActiveLimit;
649 }
650
651 UChar32 c;
652 U_ASSERT(startPos >= 0);
653
654 switch (fPattern->fStartType) {
655 case START_NO_INFO:
656 // No optimization was found.
657 // Try a match at each input position.
658 for (;;) {
659 MatchAt(startPos, FALSE, fDeferredStatus);
660 if (U_FAILURE(fDeferredStatus)) {
661 return FALSE;
662 }
663 if (fMatch) {
664 return TRUE;
665 }
666 if (startPos >= testStartLimit) {
667 fHitEnd = TRUE;
668 return FALSE;
669 }
670 UTEXT_SETNATIVEINDEX(fInputText, startPos);
671 (void)UTEXT_NEXT32(fInputText);
672 startPos = UTEXT_GETNATIVEINDEX(fInputText);
673 // Note that it's perfectly OK for a pattern to have a zero-length
674 // match at the end of a string, so we must make sure that the loop
675 // runs with startPos == testStartLimit the last time through.
676 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
677 return FALSE;
678 }
679 U_ASSERT(FALSE);
680
681 case START_START:
682 // Matches are only possible at the start of the input string
683 // (pattern begins with ^ or \A)
684 if (startPos > fActiveStart) {
685 fMatch = FALSE;
686 return FALSE;
687 }
688 MatchAt(startPos, FALSE, fDeferredStatus);
689 if (U_FAILURE(fDeferredStatus)) {
690 return FALSE;
691 }
692 return fMatch;
693
694
695 case START_SET:
696 {
697 // Match may start on any char from a pre-computed set.
698 U_ASSERT(fPattern->fMinMatchLen > 0);
699 int64_t pos;
700 UTEXT_SETNATIVEINDEX(fInputText, startPos);
701 for (;;) {
702 c = UTEXT_NEXT32(fInputText);
703 pos = UTEXT_GETNATIVEINDEX(fInputText);
704 // c will be -1 (U_SENTINEL) at end of text, in which case we
705 // skip this next block (so we don't have a negative array index)
706 // and handle end of text in the following block.
707 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
708 (c>=256 && fPattern->fInitialChars->contains(c)))) {
709 MatchAt(startPos, FALSE, fDeferredStatus);
710 if (U_FAILURE(fDeferredStatus)) {
711 return FALSE;
712 }
713 if (fMatch) {
714 return TRUE;
715 }
716 UTEXT_SETNATIVEINDEX(fInputText, pos);
717 }
718 if (startPos >= testStartLimit) {
719 fMatch = FALSE;
720 fHitEnd = TRUE;
721 return FALSE;
722 }
723 startPos = pos;
724 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
725 return FALSE;
726 }
727 }
728 U_ASSERT(FALSE);
729
730 case START_STRING:
731 case START_CHAR:
732 {
733 // Match starts on exactly one char.
734 U_ASSERT(fPattern->fMinMatchLen > 0);
735 UChar32 theChar = fPattern->fInitialChar;
736 int64_t pos;
737 UTEXT_SETNATIVEINDEX(fInputText, startPos);
738 for (;;) {
739 c = UTEXT_NEXT32(fInputText);
740 pos = UTEXT_GETNATIVEINDEX(fInputText);
741 if (c == theChar) {
742 MatchAt(startPos, FALSE, fDeferredStatus);
743 if (U_FAILURE(fDeferredStatus)) {
744 return FALSE;
745 }
746 if (fMatch) {
747 return TRUE;
748 }
749 UTEXT_SETNATIVEINDEX(fInputText, pos);
750 }
751 if (startPos >= testStartLimit) {
752 fMatch = FALSE;
753 fHitEnd = TRUE;
754 return FALSE;
755 }
756 startPos = pos;
757 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
758 return FALSE;
759 }
760 }
761 U_ASSERT(FALSE);
762
763 case START_LINE:
764 {
765 UChar32 c;
766 if (startPos == fAnchorStart) {
767 MatchAt(startPos, FALSE, fDeferredStatus);
768 if (U_FAILURE(fDeferredStatus)) {
769 return FALSE;
770 }
771 if (fMatch) {
772 return TRUE;
773 }
774 UTEXT_SETNATIVEINDEX(fInputText, startPos);
775 c = UTEXT_NEXT32(fInputText);
776 startPos = UTEXT_GETNATIVEINDEX(fInputText);
777 } else {
778 UTEXT_SETNATIVEINDEX(fInputText, startPos);
779 c = UTEXT_PREVIOUS32(fInputText);
780 UTEXT_SETNATIVEINDEX(fInputText, startPos);
781 }
782
783 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
784 for (;;) {
785 if (c == 0x0a) {
786 MatchAt(startPos, FALSE, fDeferredStatus);
787 if (U_FAILURE(fDeferredStatus)) {
788 return FALSE;
789 }
790 if (fMatch) {
791 return TRUE;
792 }
793 UTEXT_SETNATIVEINDEX(fInputText, startPos);
794 }
795 if (startPos >= testStartLimit) {
796 fMatch = FALSE;
797 fHitEnd = TRUE;
798 return FALSE;
799 }
800 c = UTEXT_NEXT32(fInputText);
801 startPos = UTEXT_GETNATIVEINDEX(fInputText);
802 // Note that it's perfectly OK for a pattern to have a zero-length
803 // match at the end of a string, so we must make sure that the loop
804 // runs with startPos == testStartLimit the last time through.
805 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
806 return FALSE;
807 }
808 } else {
809 for (;;) {
810 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
811 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
812 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
813 (void)UTEXT_NEXT32(fInputText);
814 startPos = UTEXT_GETNATIVEINDEX(fInputText);
815 }
816 MatchAt(startPos, FALSE, fDeferredStatus);
817 if (U_FAILURE(fDeferredStatus)) {
818 return FALSE;
819 }
820 if (fMatch) {
821 return TRUE;
822 }
823 UTEXT_SETNATIVEINDEX(fInputText, startPos);
824 }
825 if (startPos >= testStartLimit) {
826 fMatch = FALSE;
827 fHitEnd = TRUE;
828 return FALSE;
829 }
830 c = UTEXT_NEXT32(fInputText);
831 startPos = UTEXT_GETNATIVEINDEX(fInputText);
832 // Note that it's perfectly OK for a pattern to have a zero-length
833 // match at the end of a string, so we must make sure that the loop
834 // runs with startPos == testStartLimit the last time through.
835 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
836 return FALSE;
837 }
838 }
839 }
840
841 default:
842 U_ASSERT(FALSE);
843 }
844
845 U_ASSERT(FALSE);
846 return FALSE;
847 }
848
849
850
find(int64_t start,UErrorCode & status)851 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
852 if (U_FAILURE(status)) {
853 return FALSE;
854 }
855 if (U_FAILURE(fDeferredStatus)) {
856 status = fDeferredStatus;
857 return FALSE;
858 }
859 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
860 // This will reset the region to be the full input length.
861 if (start < 0) {
862 status = U_INDEX_OUTOFBOUNDS_ERROR;
863 return FALSE;
864 }
865
866 int64_t nativeStart = start;
867 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
868 status = U_INDEX_OUTOFBOUNDS_ERROR;
869 return FALSE;
870 }
871 fMatchEnd = nativeStart;
872 return find();
873 }
874
875
876 //--------------------------------------------------------------------------------
877 //
878 // findUsingChunk() -- like find(), but with the advance knowledge that the
879 // entire string is available in the UText's chunk buffer.
880 //
881 //--------------------------------------------------------------------------------
findUsingChunk()882 UBool RegexMatcher::findUsingChunk() {
883 // Start at the position of the last match end. (Will be zero if the
884 // matcher has been reset.
885 //
886
887 int32_t startPos = (int32_t)fMatchEnd;
888 if (startPos==0) {
889 startPos = (int32_t)fActiveStart;
890 }
891
892 const UChar *inputBuf = fInputText->chunkContents;
893
894 if (fMatch) {
895 // Save the position of any previous successful match.
896 fLastMatchEnd = fMatchEnd;
897
898 if (fMatchStart == fMatchEnd) {
899 // Previous match had zero length. Move start position up one position
900 // to avoid sending find() into a loop on zero-length matches.
901 if (startPos >= fActiveLimit) {
902 fMatch = FALSE;
903 fHitEnd = TRUE;
904 return FALSE;
905 }
906 U16_FWD_1(inputBuf, startPos, fInputLength);
907 }
908 } else {
909 if (fLastMatchEnd >= 0) {
910 // A previous find() failed to match. Don't try again.
911 // (without this test, a pattern with a zero-length match
912 // could match again at the end of an input string.)
913 fHitEnd = TRUE;
914 return FALSE;
915 }
916 }
917
918
919 // Compute the position in the input string beyond which a match can not begin, because
920 // the minimum length match would extend past the end of the input.
921 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
922 // Be aware of possible overflows if making changes here.
923 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
924 if (startPos > testLen) {
925 fMatch = FALSE;
926 fHitEnd = TRUE;
927 return FALSE;
928 }
929
930 UChar32 c;
931 U_ASSERT(startPos >= 0);
932
933 switch (fPattern->fStartType) {
934 case START_NO_INFO:
935 // No optimization was found.
936 // Try a match at each input position.
937 for (;;) {
938 MatchChunkAt(startPos, FALSE, fDeferredStatus);
939 if (U_FAILURE(fDeferredStatus)) {
940 return FALSE;
941 }
942 if (fMatch) {
943 return TRUE;
944 }
945 if (startPos >= testLen) {
946 fHitEnd = TRUE;
947 return FALSE;
948 }
949 U16_FWD_1(inputBuf, startPos, fActiveLimit);
950 // Note that it's perfectly OK for a pattern to have a zero-length
951 // match at the end of a string, so we must make sure that the loop
952 // runs with startPos == testLen the last time through.
953 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
954 return FALSE;
955 }
956 U_ASSERT(FALSE);
957
958 case START_START:
959 // Matches are only possible at the start of the input string
960 // (pattern begins with ^ or \A)
961 if (startPos > fActiveStart) {
962 fMatch = FALSE;
963 return FALSE;
964 }
965 MatchChunkAt(startPos, FALSE, fDeferredStatus);
966 if (U_FAILURE(fDeferredStatus)) {
967 return FALSE;
968 }
969 return fMatch;
970
971
972 case START_SET:
973 {
974 // Match may start on any char from a pre-computed set.
975 U_ASSERT(fPattern->fMinMatchLen > 0);
976 for (;;) {
977 int32_t pos = startPos;
978 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
979 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
980 (c>=256 && fPattern->fInitialChars->contains(c))) {
981 MatchChunkAt(pos, FALSE, fDeferredStatus);
982 if (U_FAILURE(fDeferredStatus)) {
983 return FALSE;
984 }
985 if (fMatch) {
986 return TRUE;
987 }
988 }
989 if (pos >= testLen) {
990 fMatch = FALSE;
991 fHitEnd = TRUE;
992 return FALSE;
993 }
994 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
995 return FALSE;
996 }
997 }
998 U_ASSERT(FALSE);
999
1000 case START_STRING:
1001 case START_CHAR:
1002 {
1003 // Match starts on exactly one char.
1004 U_ASSERT(fPattern->fMinMatchLen > 0);
1005 UChar32 theChar = fPattern->fInitialChar;
1006 for (;;) {
1007 int32_t pos = startPos;
1008 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1009 if (c == theChar) {
1010 MatchChunkAt(pos, FALSE, fDeferredStatus);
1011 if (U_FAILURE(fDeferredStatus)) {
1012 return FALSE;
1013 }
1014 if (fMatch) {
1015 return TRUE;
1016 }
1017 }
1018 if (pos >= testLen) {
1019 fMatch = FALSE;
1020 fHitEnd = TRUE;
1021 return FALSE;
1022 }
1023 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1024 return FALSE;
1025 }
1026 }
1027 U_ASSERT(FALSE);
1028
1029 case START_LINE:
1030 {
1031 UChar32 c;
1032 if (startPos == fAnchorStart) {
1033 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1034 if (U_FAILURE(fDeferredStatus)) {
1035 return FALSE;
1036 }
1037 if (fMatch) {
1038 return TRUE;
1039 }
1040 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1041 }
1042
1043 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1044 for (;;) {
1045 c = inputBuf[startPos-1];
1046 if (c == 0x0a) {
1047 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1048 if (U_FAILURE(fDeferredStatus)) {
1049 return FALSE;
1050 }
1051 if (fMatch) {
1052 return TRUE;
1053 }
1054 }
1055 if (startPos >= testLen) {
1056 fMatch = FALSE;
1057 fHitEnd = TRUE;
1058 return FALSE;
1059 }
1060 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1061 // Note that it's perfectly OK for a pattern to have a zero-length
1062 // match at the end of a string, so we must make sure that the loop
1063 // runs with startPos == testLen the last time through.
1064 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1065 return FALSE;
1066 }
1067 } else {
1068 for (;;) {
1069 c = inputBuf[startPos-1];
1070 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
1071 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1072 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1073 startPos++;
1074 }
1075 MatchChunkAt(startPos, FALSE, fDeferredStatus);
1076 if (U_FAILURE(fDeferredStatus)) {
1077 return FALSE;
1078 }
1079 if (fMatch) {
1080 return TRUE;
1081 }
1082 }
1083 if (startPos >= testLen) {
1084 fMatch = FALSE;
1085 fHitEnd = TRUE;
1086 return FALSE;
1087 }
1088 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089 // Note that it's perfectly OK for a pattern to have a zero-length
1090 // match at the end of a string, so we must make sure that the loop
1091 // runs with startPos == testLen the last time through.
1092 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1093 return FALSE;
1094 }
1095 }
1096 }
1097
1098 default:
1099 U_ASSERT(FALSE);
1100 }
1101
1102 U_ASSERT(FALSE);
1103 return FALSE;
1104 }
1105
1106
1107
1108 //--------------------------------------------------------------------------------
1109 //
1110 // group()
1111 //
1112 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1113 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1114 return group(0, status);
1115 }
1116
1117 // Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1118 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1119 return group(0, dest, group_len, status);
1120 }
1121
1122 // Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1123 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1124 group_len = 0;
1125 UBool bailOut = FALSE;
1126 if (U_FAILURE(status)) {
1127 return dest;
1128 }
1129 if (U_FAILURE(fDeferredStatus)) {
1130 status = fDeferredStatus;
1131 bailOut = TRUE;
1132 }
1133 if (fMatch == FALSE) {
1134 status = U_REGEX_INVALID_STATE;
1135 bailOut = TRUE;
1136 }
1137 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1138 status = U_INDEX_OUTOFBOUNDS_ERROR;
1139 bailOut = TRUE;
1140 }
1141
1142 if (bailOut) {
1143 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1144 }
1145
1146 int64_t s, e;
1147 if (groupNum == 0) {
1148 s = fMatchStart;
1149 e = fMatchEnd;
1150 } else {
1151 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1152 U_ASSERT(groupOffset < fPattern->fFrameSize);
1153 U_ASSERT(groupOffset >= 0);
1154 s = fFrame->fExtra[groupOffset];
1155 e = fFrame->fExtra[groupOffset+1];
1156 }
1157
1158 if (s < 0) {
1159 // A capture group wasn't part of the match
1160 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1161 }
1162 U_ASSERT(s <= e);
1163 group_len = e - s;
1164
1165 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1166 if (dest)
1167 UTEXT_SETNATIVEINDEX(dest, s);
1168 return dest;
1169 }
1170
group(int32_t groupNum,UErrorCode & status) const1171 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1172 UnicodeString result;
1173 if (U_FAILURE(status)) {
1174 return result;
1175 }
1176 UText resultText = UTEXT_INITIALIZER;
1177 utext_openUnicodeString(&resultText, &result, &status);
1178 group(groupNum, &resultText, status);
1179 utext_close(&resultText);
1180 return result;
1181 }
1182
1183
1184 // Return deep (mutable) clone
1185 // Technology Preview (as an API), but note that the UnicodeString API is implemented
1186 // using this function.
group(int32_t groupNum,UText * dest,UErrorCode & status) const1187 UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1188 UBool bailOut = FALSE;
1189 if (U_FAILURE(status)) {
1190 return dest;
1191 }
1192 if (U_FAILURE(fDeferredStatus)) {
1193 status = fDeferredStatus;
1194 bailOut = TRUE;
1195 }
1196
1197 if (fMatch == FALSE) {
1198 status = U_REGEX_INVALID_STATE;
1199 bailOut = TRUE;
1200 }
1201 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202 status = U_INDEX_OUTOFBOUNDS_ERROR;
1203 bailOut = TRUE;
1204 }
1205
1206 if (bailOut) {
1207 if (dest) {
1208 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1209 return dest;
1210 } else {
1211 return utext_openUChars(NULL, NULL, 0, &status);
1212 }
1213 }
1214
1215 int64_t s, e;
1216 if (groupNum == 0) {
1217 s = fMatchStart;
1218 e = fMatchEnd;
1219 } else {
1220 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1221 U_ASSERT(groupOffset < fPattern->fFrameSize);
1222 U_ASSERT(groupOffset >= 0);
1223 s = fFrame->fExtra[groupOffset];
1224 e = fFrame->fExtra[groupOffset+1];
1225 }
1226
1227 if (s < 0) {
1228 // A capture group wasn't part of the match
1229 if (dest) {
1230 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1231 return dest;
1232 } else {
1233 return utext_openUChars(NULL, NULL, 0, &status);
1234 }
1235 }
1236 U_ASSERT(s <= e);
1237
1238 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1239 U_ASSERT(e <= fInputLength);
1240 if (dest) {
1241 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1242 } else {
1243 UText groupText = UTEXT_INITIALIZER;
1244 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1245 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1246 utext_close(&groupText);
1247 }
1248 } else {
1249 int32_t len16;
1250 if (UTEXT_USES_U16(fInputText)) {
1251 len16 = (int32_t)(e-s);
1252 } else {
1253 UErrorCode lengthStatus = U_ZERO_ERROR;
1254 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1255 }
1256 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1257 if (groupChars == NULL) {
1258 status = U_MEMORY_ALLOCATION_ERROR;
1259 return dest;
1260 }
1261 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1262
1263 if (dest) {
1264 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1265 } else {
1266 UText groupText = UTEXT_INITIALIZER;
1267 utext_openUChars(&groupText, groupChars, len16, &status);
1268 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1269 utext_close(&groupText);
1270 }
1271
1272 uprv_free(groupChars);
1273 }
1274 return dest;
1275 }
1276
1277 //--------------------------------------------------------------------------------
1278 //
1279 // appendGroup() -- currently internal only, appends a group to a UText rather
1280 // than replacing its contents
1281 //
1282 //--------------------------------------------------------------------------------
1283
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1284 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1285 if (U_FAILURE(status)) {
1286 return 0;
1287 }
1288 if (U_FAILURE(fDeferredStatus)) {
1289 status = fDeferredStatus;
1290 return 0;
1291 }
1292 int64_t destLen = utext_nativeLength(dest);
1293
1294 if (fMatch == FALSE) {
1295 status = U_REGEX_INVALID_STATE;
1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297 }
1298 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1299 status = U_INDEX_OUTOFBOUNDS_ERROR;
1300 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1301 }
1302
1303 int64_t s, e;
1304 if (groupNum == 0) {
1305 s = fMatchStart;
1306 e = fMatchEnd;
1307 } else {
1308 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1309 U_ASSERT(groupOffset < fPattern->fFrameSize);
1310 U_ASSERT(groupOffset >= 0);
1311 s = fFrame->fExtra[groupOffset];
1312 e = fFrame->fExtra[groupOffset+1];
1313 }
1314
1315 if (s < 0) {
1316 // A capture group wasn't part of the match
1317 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1318 }
1319 U_ASSERT(s <= e);
1320
1321 int64_t deltaLen;
1322 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1323 U_ASSERT(e <= fInputLength);
1324 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1325 } else {
1326 int32_t len16;
1327 if (UTEXT_USES_U16(fInputText)) {
1328 len16 = (int32_t)(e-s);
1329 } else {
1330 UErrorCode lengthStatus = U_ZERO_ERROR;
1331 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1332 }
1333 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1334 if (groupChars == NULL) {
1335 status = U_MEMORY_ALLOCATION_ERROR;
1336 return 0;
1337 }
1338 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1339
1340 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1341 uprv_free(groupChars);
1342 }
1343 return deltaLen;
1344 }
1345
1346
1347
1348 //--------------------------------------------------------------------------------
1349 //
1350 // groupCount()
1351 //
1352 //--------------------------------------------------------------------------------
groupCount() const1353 int32_t RegexMatcher::groupCount() const {
1354 return fPattern->fGroupMap->size();
1355 }
1356
1357
1358
1359 //--------------------------------------------------------------------------------
1360 //
1361 // hasAnchoringBounds()
1362 //
1363 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1364 UBool RegexMatcher::hasAnchoringBounds() const {
1365 return fAnchoringBounds;
1366 }
1367
1368
1369 //--------------------------------------------------------------------------------
1370 //
1371 // hasTransparentBounds()
1372 //
1373 //--------------------------------------------------------------------------------
hasTransparentBounds() const1374 UBool RegexMatcher::hasTransparentBounds() const {
1375 return fTransparentBounds;
1376 }
1377
1378
1379
1380 //--------------------------------------------------------------------------------
1381 //
1382 // hitEnd()
1383 //
1384 //--------------------------------------------------------------------------------
hitEnd() const1385 UBool RegexMatcher::hitEnd() const {
1386 return fHitEnd;
1387 }
1388
1389
1390 //--------------------------------------------------------------------------------
1391 //
1392 // input()
1393 //
1394 //--------------------------------------------------------------------------------
input() const1395 const UnicodeString &RegexMatcher::input() const {
1396 if (!fInput) {
1397 UErrorCode status = U_ZERO_ERROR;
1398 int32_t len16;
1399 if (UTEXT_USES_U16(fInputText)) {
1400 len16 = (int32_t)fInputLength;
1401 } else {
1402 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1403 status = U_ZERO_ERROR; // overflow, length status
1404 }
1405 UnicodeString *result = new UnicodeString(len16, 0, 0);
1406
1407 UChar *inputChars = result->getBuffer(len16);
1408 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1409 result->releaseBuffer(len16);
1410
1411 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1412 }
1413
1414 return *fInput;
1415 }
1416
1417 //--------------------------------------------------------------------------------
1418 //
1419 // inputText()
1420 //
1421 //--------------------------------------------------------------------------------
inputText() const1422 UText *RegexMatcher::inputText() const {
1423 return fInputText;
1424 }
1425
1426
1427 //--------------------------------------------------------------------------------
1428 //
1429 // getInput() -- like inputText(), but makes a clone or copies into another UText
1430 //
1431 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1432 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1433 UBool bailOut = FALSE;
1434 if (U_FAILURE(status)) {
1435 return dest;
1436 }
1437 if (U_FAILURE(fDeferredStatus)) {
1438 status = fDeferredStatus;
1439 bailOut = TRUE;
1440 }
1441
1442 if (bailOut) {
1443 if (dest) {
1444 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1445 return dest;
1446 } else {
1447 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1448 }
1449 }
1450
1451 if (dest) {
1452 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1453 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1454 } else {
1455 int32_t input16Len;
1456 if (UTEXT_USES_U16(fInputText)) {
1457 input16Len = (int32_t)fInputLength;
1458 } else {
1459 UErrorCode lengthStatus = U_ZERO_ERROR;
1460 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1461 }
1462 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1463 if (inputChars == NULL) {
1464 return dest;
1465 }
1466
1467 status = U_ZERO_ERROR;
1468 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1469 status = U_ZERO_ERROR;
1470 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1471
1472 uprv_free(inputChars);
1473 }
1474 return dest;
1475 } else {
1476 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1477 }
1478 }
1479
1480
1481 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1482 static UBool compat_SyncMutableUTextContents(UText *ut) {
1483 UBool retVal = FALSE;
1484
1485 // In the following test, we're really only interested in whether the UText should switch
1486 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1487 // will still point to the correct data.
1488 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1489 UnicodeString *us=(UnicodeString *)ut->context;
1490
1491 // Update to the latest length.
1492 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493 int32_t newLength = us->length();
1494
1495 // Update the chunk description.
1496 // The buffer may have switched between stack- and heap-based.
1497 ut->chunkContents = us->getBuffer();
1498 ut->chunkLength = newLength;
1499 ut->chunkNativeLimit = newLength;
1500 ut->nativeIndexingLimit = newLength;
1501 retVal = TRUE;
1502 }
1503
1504 return retVal;
1505 }
1506
1507 //--------------------------------------------------------------------------------
1508 //
1509 // lookingAt()
1510 //
1511 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1512 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1513 if (U_FAILURE(status)) {
1514 return FALSE;
1515 }
1516 if (U_FAILURE(fDeferredStatus)) {
1517 status = fDeferredStatus;
1518 return FALSE;
1519 }
1520
1521 if (fInputUniStrMaybeMutable) {
1522 if (compat_SyncMutableUTextContents(fInputText)) {
1523 fInputLength = utext_nativeLength(fInputText);
1524 reset();
1525 }
1526 }
1527 else {
1528 resetPreserveRegion();
1529 }
1530 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1531 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1532 } else {
1533 MatchAt(fActiveStart, FALSE, status);
1534 }
1535 return fMatch;
1536 }
1537
1538
lookingAt(int64_t start,UErrorCode & status)1539 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1540 if (U_FAILURE(status)) {
1541 return FALSE;
1542 }
1543 if (U_FAILURE(fDeferredStatus)) {
1544 status = fDeferredStatus;
1545 return FALSE;
1546 }
1547 reset();
1548
1549 if (start < 0) {
1550 status = U_INDEX_OUTOFBOUNDS_ERROR;
1551 return FALSE;
1552 }
1553
1554 if (fInputUniStrMaybeMutable) {
1555 if (compat_SyncMutableUTextContents(fInputText)) {
1556 fInputLength = utext_nativeLength(fInputText);
1557 reset();
1558 }
1559 }
1560
1561 int64_t nativeStart;
1562 nativeStart = start;
1563 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1564 status = U_INDEX_OUTOFBOUNDS_ERROR;
1565 return FALSE;
1566 }
1567
1568 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1569 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1570 } else {
1571 MatchAt(nativeStart, FALSE, status);
1572 }
1573 return fMatch;
1574 }
1575
1576
1577
1578 //--------------------------------------------------------------------------------
1579 //
1580 // matches()
1581 //
1582 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1583 UBool RegexMatcher::matches(UErrorCode &status) {
1584 if (U_FAILURE(status)) {
1585 return FALSE;
1586 }
1587 if (U_FAILURE(fDeferredStatus)) {
1588 status = fDeferredStatus;
1589 return FALSE;
1590 }
1591
1592 if (fInputUniStrMaybeMutable) {
1593 if (compat_SyncMutableUTextContents(fInputText)) {
1594 fInputLength = utext_nativeLength(fInputText);
1595 reset();
1596 }
1597 }
1598 else {
1599 resetPreserveRegion();
1600 }
1601
1602 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1603 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1604 } else {
1605 MatchAt(fActiveStart, TRUE, status);
1606 }
1607 return fMatch;
1608 }
1609
1610
matches(int64_t start,UErrorCode & status)1611 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1612 if (U_FAILURE(status)) {
1613 return FALSE;
1614 }
1615 if (U_FAILURE(fDeferredStatus)) {
1616 status = fDeferredStatus;
1617 return FALSE;
1618 }
1619 reset();
1620
1621 if (start < 0) {
1622 status = U_INDEX_OUTOFBOUNDS_ERROR;
1623 return FALSE;
1624 }
1625
1626 if (fInputUniStrMaybeMutable) {
1627 if (compat_SyncMutableUTextContents(fInputText)) {
1628 fInputLength = utext_nativeLength(fInputText);
1629 reset();
1630 }
1631 }
1632
1633 int64_t nativeStart;
1634 nativeStart = start;
1635 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1636 status = U_INDEX_OUTOFBOUNDS_ERROR;
1637 return FALSE;
1638 }
1639
1640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1641 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1642 } else {
1643 MatchAt(nativeStart, TRUE, status);
1644 }
1645 return fMatch;
1646 }
1647
1648
1649
1650 //--------------------------------------------------------------------------------
1651 //
1652 // pattern
1653 //
1654 //--------------------------------------------------------------------------------
pattern() const1655 const RegexPattern &RegexMatcher::pattern() const {
1656 return *fPattern;
1657 }
1658
1659
1660
1661 //--------------------------------------------------------------------------------
1662 //
1663 // region
1664 //
1665 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1666 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1667 if (U_FAILURE(status)) {
1668 return *this;
1669 }
1670
1671 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1672 status = U_ILLEGAL_ARGUMENT_ERROR;
1673 }
1674
1675 int64_t nativeStart = regionStart;
1676 int64_t nativeLimit = regionLimit;
1677 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1678 status = U_ILLEGAL_ARGUMENT_ERROR;
1679 }
1680
1681 if (startIndex == -1)
1682 this->reset();
1683 else
1684 resetPreserveRegion();
1685
1686 fRegionStart = nativeStart;
1687 fRegionLimit = nativeLimit;
1688 fActiveStart = nativeStart;
1689 fActiveLimit = nativeLimit;
1690
1691 if (startIndex != -1) {
1692 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1693 status = U_INDEX_OUTOFBOUNDS_ERROR;
1694 }
1695 fMatchEnd = startIndex;
1696 }
1697
1698 if (!fTransparentBounds) {
1699 fLookStart = nativeStart;
1700 fLookLimit = nativeLimit;
1701 }
1702 if (fAnchoringBounds) {
1703 fAnchorStart = nativeStart;
1704 fAnchorLimit = nativeLimit;
1705 }
1706 return *this;
1707 }
1708
region(int64_t start,int64_t limit,UErrorCode & status)1709 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1710 return region(start, limit, -1, status);
1711 }
1712
1713 //--------------------------------------------------------------------------------
1714 //
1715 // regionEnd
1716 //
1717 //--------------------------------------------------------------------------------
regionEnd() const1718 int32_t RegexMatcher::regionEnd() const {
1719 return (int32_t)fRegionLimit;
1720 }
1721
regionEnd64() const1722 int64_t RegexMatcher::regionEnd64() const {
1723 return fRegionLimit;
1724 }
1725
1726 //--------------------------------------------------------------------------------
1727 //
1728 // regionStart
1729 //
1730 //--------------------------------------------------------------------------------
regionStart() const1731 int32_t RegexMatcher::regionStart() const {
1732 return (int32_t)fRegionStart;
1733 }
1734
regionStart64() const1735 int64_t RegexMatcher::regionStart64() const {
1736 return fRegionStart;
1737 }
1738
1739
1740 //--------------------------------------------------------------------------------
1741 //
1742 // replaceAll
1743 //
1744 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1745 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1746 UText replacementText = UTEXT_INITIALIZER;
1747 UText resultText = UTEXT_INITIALIZER;
1748 UnicodeString resultString;
1749 if (U_FAILURE(status)) {
1750 return resultString;
1751 }
1752
1753 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754 utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756 replaceAll(&replacementText, &resultText, status);
1757
1758 utext_close(&resultText);
1759 utext_close(&replacementText);
1760
1761 return resultString;
1762 }
1763
1764
1765 //
1766 // replaceAll, UText mode
1767 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1768 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1769 if (U_FAILURE(status)) {
1770 return dest;
1771 }
1772 if (U_FAILURE(fDeferredStatus)) {
1773 status = fDeferredStatus;
1774 return dest;
1775 }
1776
1777 if (dest == NULL) {
1778 UnicodeString emptyString;
1779 UText empty = UTEXT_INITIALIZER;
1780
1781 utext_openUnicodeString(&empty, &emptyString, &status);
1782 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783 utext_close(&empty);
1784 }
1785
1786 if (U_SUCCESS(status)) {
1787 reset();
1788 while (find()) {
1789 appendReplacement(dest, replacement, status);
1790 if (U_FAILURE(status)) {
1791 break;
1792 }
1793 }
1794 appendTail(dest, status);
1795 }
1796
1797 return dest;
1798 }
1799
1800
1801 //--------------------------------------------------------------------------------
1802 //
1803 // replaceFirst
1804 //
1805 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1806 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1807 UText replacementText = UTEXT_INITIALIZER;
1808 UText resultText = UTEXT_INITIALIZER;
1809 UnicodeString resultString;
1810
1811 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1812 utext_openUnicodeString(&resultText, &resultString, &status);
1813
1814 replaceFirst(&replacementText, &resultText, status);
1815
1816 utext_close(&resultText);
1817 utext_close(&replacementText);
1818
1819 return resultString;
1820 }
1821
1822 //
1823 // replaceFirst, UText mode
1824 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1825 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1826 if (U_FAILURE(status)) {
1827 return dest;
1828 }
1829 if (U_FAILURE(fDeferredStatus)) {
1830 status = fDeferredStatus;
1831 return dest;
1832 }
1833
1834 reset();
1835 if (!find()) {
1836 return getInput(dest, status);
1837 }
1838
1839 if (dest == NULL) {
1840 UnicodeString emptyString;
1841 UText empty = UTEXT_INITIALIZER;
1842
1843 utext_openUnicodeString(&empty, &emptyString, &status);
1844 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1845 utext_close(&empty);
1846 }
1847
1848 appendReplacement(dest, replacement, status);
1849 appendTail(dest, status);
1850
1851 return dest;
1852 }
1853
1854
1855 //--------------------------------------------------------------------------------
1856 //
1857 // requireEnd
1858 //
1859 //--------------------------------------------------------------------------------
requireEnd() const1860 UBool RegexMatcher::requireEnd() const {
1861 return fRequireEnd;
1862 }
1863
1864
1865 //--------------------------------------------------------------------------------
1866 //
1867 // reset
1868 //
1869 //--------------------------------------------------------------------------------
reset()1870 RegexMatcher &RegexMatcher::reset() {
1871 fRegionStart = 0;
1872 fRegionLimit = fInputLength;
1873 fActiveStart = 0;
1874 fActiveLimit = fInputLength;
1875 fAnchorStart = 0;
1876 fAnchorLimit = fInputLength;
1877 fLookStart = 0;
1878 fLookLimit = fInputLength;
1879 resetPreserveRegion();
1880 return *this;
1881 }
1882
1883
1884
resetPreserveRegion()1885 void RegexMatcher::resetPreserveRegion() {
1886 fMatchStart = 0;
1887 fMatchEnd = 0;
1888 fLastMatchEnd = -1;
1889 fAppendPosition = 0;
1890 fMatch = FALSE;
1891 fHitEnd = FALSE;
1892 fRequireEnd = FALSE;
1893 fTime = 0;
1894 fTickCounter = TIMER_INITIAL_VALUE;
1895 //resetStack(); // more expensive than it looks...
1896 }
1897
1898
reset(const UnicodeString & input)1899 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1900 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1901 if (fPattern->fNeedsAltInput) {
1902 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1903 }
1904 fInputLength = utext_nativeLength(fInputText);
1905
1906 reset();
1907 delete fInput;
1908 fInput = NULL;
1909
1910 // Do the following for any UnicodeString.
1911 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1912 fInputUniStrMaybeMutable = TRUE;
1913
1914 if (fWordBreakItr != NULL) {
1915 #if UCONFIG_NO_BREAK_ITERATION==0
1916 UErrorCode status = U_ZERO_ERROR;
1917 fWordBreakItr->setText(fInputText, status);
1918 #endif
1919 }
1920 return *this;
1921 }
1922
1923
reset(UText * input)1924 RegexMatcher &RegexMatcher::reset(UText *input) {
1925 if (fInputText != input) {
1926 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1927 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1928 fInputLength = utext_nativeLength(fInputText);
1929
1930 delete fInput;
1931 fInput = NULL;
1932
1933 if (fWordBreakItr != NULL) {
1934 #if UCONFIG_NO_BREAK_ITERATION==0
1935 UErrorCode status = U_ZERO_ERROR;
1936 fWordBreakItr->setText(input, status);
1937 #endif
1938 }
1939 }
1940 reset();
1941 fInputUniStrMaybeMutable = FALSE;
1942
1943 return *this;
1944 }
1945
1946 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1948 return *this;
1949 }*/
1950
reset(int64_t position,UErrorCode & status)1951 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1952 if (U_FAILURE(status)) {
1953 return *this;
1954 }
1955 reset(); // Reset also resets the region to be the entire string.
1956
1957 if (position < 0 || position > fActiveLimit) {
1958 status = U_INDEX_OUTOFBOUNDS_ERROR;
1959 return *this;
1960 }
1961 fMatchEnd = position;
1962 return *this;
1963 }
1964
1965
1966 //--------------------------------------------------------------------------------
1967 //
1968 // refresh
1969 //
1970 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1971 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1972 if (U_FAILURE(status)) {
1973 return *this;
1974 }
1975 if (input == NULL) {
1976 status = U_ILLEGAL_ARGUMENT_ERROR;
1977 return *this;
1978 }
1979 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1980 status = U_ILLEGAL_ARGUMENT_ERROR;
1981 return *this;
1982 }
1983 int64_t pos = utext_getNativeIndex(fInputText);
1984 // Shallow read-only clone of the new UText into the existing input UText
1985 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1986 if (U_FAILURE(status)) {
1987 return *this;
1988 }
1989 utext_setNativeIndex(fInputText, pos);
1990
1991 if (fAltInputText != NULL) {
1992 pos = utext_getNativeIndex(fAltInputText);
1993 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1994 if (U_FAILURE(status)) {
1995 return *this;
1996 }
1997 utext_setNativeIndex(fAltInputText, pos);
1998 }
1999 return *this;
2000 }
2001
2002
2003
2004 //--------------------------------------------------------------------------------
2005 //
2006 // setTrace
2007 //
2008 //--------------------------------------------------------------------------------
setTrace(UBool state)2009 void RegexMatcher::setTrace(UBool state) {
2010 fTraceDebug = state;
2011 }
2012
2013
2014
2015 //---------------------------------------------------------------------
2016 //
2017 // split
2018 //
2019 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2020 int32_t RegexMatcher::split(const UnicodeString &input,
2021 UnicodeString dest[],
2022 int32_t destCapacity,
2023 UErrorCode &status)
2024 {
2025 UText inputText = UTEXT_INITIALIZER;
2026 utext_openConstUnicodeString(&inputText, &input, &status);
2027 if (U_FAILURE(status)) {
2028 return 0;
2029 }
2030
2031 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2032 if (destText == NULL) {
2033 status = U_MEMORY_ALLOCATION_ERROR;
2034 return 0;
2035 }
2036 int32_t i;
2037 for (i = 0; i < destCapacity; i++) {
2038 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2039 }
2040
2041 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2042
2043 for (i = 0; i < destCapacity; i++) {
2044 utext_close(destText[i]);
2045 }
2046
2047 uprv_free(destText);
2048 utext_close(&inputText);
2049 return fieldCount;
2050 }
2051
2052 //
2053 // split, UText mode
2054 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2055 int32_t RegexMatcher::split(UText *input,
2056 UText *dest[],
2057 int32_t destCapacity,
2058 UErrorCode &status)
2059 {
2060 //
2061 // Check arguements for validity
2062 //
2063 if (U_FAILURE(status)) {
2064 return 0;
2065 };
2066
2067 if (destCapacity < 1) {
2068 status = U_ILLEGAL_ARGUMENT_ERROR;
2069 return 0;
2070 }
2071
2072 //
2073 // Reset for the input text
2074 //
2075 reset(input);
2076 int64_t nextOutputStringStart = 0;
2077 if (fActiveLimit == 0) {
2078 return 0;
2079 }
2080
2081 //
2082 // Loop through the input text, searching for the delimiter pattern
2083 //
2084 int32_t i;
2085 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2086 for (i=0; ; i++) {
2087 if (i>=destCapacity-1) {
2088 // There is one or zero output string left.
2089 // Fill the last output string with whatever is left from the input, then exit the loop.
2090 // ( i will be == destCapacity if we filled the output array while processing
2091 // capture groups of the delimiter expression, in which case we will discard the
2092 // last capture group saved in favor of the unprocessed remainder of the
2093 // input string.)
2094 i = destCapacity-1;
2095 if (fActiveLimit > nextOutputStringStart) {
2096 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2097 if (dest[i]) {
2098 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2099 input->chunkContents+nextOutputStringStart,
2100 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2101 } else {
2102 UText remainingText = UTEXT_INITIALIZER;
2103 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2104 fActiveLimit-nextOutputStringStart, &status);
2105 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2106 utext_close(&remainingText);
2107 }
2108 } else {
2109 UErrorCode lengthStatus = U_ZERO_ERROR;
2110 int32_t remaining16Length =
2111 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2112 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2113 if (remainingChars == NULL) {
2114 status = U_MEMORY_ALLOCATION_ERROR;
2115 break;
2116 }
2117
2118 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2119 if (dest[i]) {
2120 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2121 } else {
2122 UText remainingText = UTEXT_INITIALIZER;
2123 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2124 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2125 utext_close(&remainingText);
2126 }
2127
2128 uprv_free(remainingChars);
2129 }
2130 }
2131 break;
2132 }
2133 if (find()) {
2134 // We found another delimiter. Move everything from where we started looking
2135 // up until the start of the delimiter into the next output string.
2136 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2137 if (dest[i]) {
2138 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2139 input->chunkContents+nextOutputStringStart,
2140 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2141 } else {
2142 UText remainingText = UTEXT_INITIALIZER;
2143 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2144 fMatchStart-nextOutputStringStart, &status);
2145 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2146 utext_close(&remainingText);
2147 }
2148 } else {
2149 UErrorCode lengthStatus = U_ZERO_ERROR;
2150 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2151 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2152 if (remainingChars == NULL) {
2153 status = U_MEMORY_ALLOCATION_ERROR;
2154 break;
2155 }
2156 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2157 if (dest[i]) {
2158 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2159 } else {
2160 UText remainingText = UTEXT_INITIALIZER;
2161 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2162 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2163 utext_close(&remainingText);
2164 }
2165
2166 uprv_free(remainingChars);
2167 }
2168 nextOutputStringStart = fMatchEnd;
2169
2170 // If the delimiter pattern has capturing parentheses, the captured
2171 // text goes out into the next n destination strings.
2172 int32_t groupNum;
2173 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2174 if (i >= destCapacity-2) {
2175 // Never fill the last available output string with capture group text.
2176 // It will filled with the last field, the remainder of the
2177 // unsplit input text.
2178 break;
2179 }
2180 i++;
2181 dest[i] = group(groupNum, dest[i], status);
2182 }
2183
2184 if (nextOutputStringStart == fActiveLimit) {
2185 // The delimiter was at the end of the string. We're done, but first
2186 // we output one last empty string, for the empty field following
2187 // the delimiter at the end of input.
2188 if (i+1 < destCapacity) {
2189 ++i;
2190 if (dest[i] == NULL) {
2191 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2192 } else {
2193 static UChar emptyString[] = {(UChar)0};
2194 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2195 }
2196 }
2197 break;
2198
2199 }
2200 }
2201 else
2202 {
2203 // We ran off the end of the input while looking for the next delimiter.
2204 // All the remaining text goes into the current output string.
2205 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2206 if (dest[i]) {
2207 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2208 input->chunkContents+nextOutputStringStart,
2209 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2210 } else {
2211 UText remainingText = UTEXT_INITIALIZER;
2212 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2213 fActiveLimit-nextOutputStringStart, &status);
2214 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2215 utext_close(&remainingText);
2216 }
2217 } else {
2218 UErrorCode lengthStatus = U_ZERO_ERROR;
2219 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2220 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2221 if (remainingChars == NULL) {
2222 status = U_MEMORY_ALLOCATION_ERROR;
2223 break;
2224 }
2225
2226 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2227 if (dest[i]) {
2228 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2229 } else {
2230 UText remainingText = UTEXT_INITIALIZER;
2231 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2232 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2233 utext_close(&remainingText);
2234 }
2235
2236 uprv_free(remainingChars);
2237 }
2238 break;
2239 }
2240 if (U_FAILURE(status)) {
2241 break;
2242 }
2243 } // end of for loop
2244 return i+1;
2245 }
2246
2247
2248 //--------------------------------------------------------------------------------
2249 //
2250 // start
2251 //
2252 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2253 int32_t RegexMatcher::start(UErrorCode &status) const {
2254 return start(0, status);
2255 }
2256
start64(UErrorCode & status) const2257 int64_t RegexMatcher::start64(UErrorCode &status) const {
2258 return start64(0, status);
2259 }
2260
2261 //--------------------------------------------------------------------------------
2262 //
2263 // start(int32_t group, UErrorCode &status)
2264 //
2265 //--------------------------------------------------------------------------------
2266
start64(int32_t group,UErrorCode & status) const2267 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2268 if (U_FAILURE(status)) {
2269 return -1;
2270 }
2271 if (U_FAILURE(fDeferredStatus)) {
2272 status = fDeferredStatus;
2273 return -1;
2274 }
2275 if (fMatch == FALSE) {
2276 status = U_REGEX_INVALID_STATE;
2277 return -1;
2278 }
2279 if (group < 0 || group > fPattern->fGroupMap->size()) {
2280 status = U_INDEX_OUTOFBOUNDS_ERROR;
2281 return -1;
2282 }
2283 int64_t s;
2284 if (group == 0) {
2285 s = fMatchStart;
2286 } else {
2287 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2288 U_ASSERT(groupOffset < fPattern->fFrameSize);
2289 U_ASSERT(groupOffset >= 0);
2290 s = fFrame->fExtra[groupOffset];
2291 }
2292
2293 return s;
2294 }
2295
2296
start(int32_t group,UErrorCode & status) const2297 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2298 return (int32_t)start64(group, status);
2299 }
2300
2301 //--------------------------------------------------------------------------------
2302 //
2303 // useAnchoringBounds
2304 //
2305 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2306 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2307 fAnchoringBounds = b;
2308 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2309 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2310 return *this;
2311 }
2312
2313
2314 //--------------------------------------------------------------------------------
2315 //
2316 // useTransparentBounds
2317 //
2318 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2319 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2320 fTransparentBounds = b;
2321 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2322 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2323 return *this;
2324 }
2325
2326 //--------------------------------------------------------------------------------
2327 //
2328 // setTimeLimit
2329 //
2330 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2331 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2332 if (U_FAILURE(status)) {
2333 return;
2334 }
2335 if (U_FAILURE(fDeferredStatus)) {
2336 status = fDeferredStatus;
2337 return;
2338 }
2339 if (limit < 0) {
2340 status = U_ILLEGAL_ARGUMENT_ERROR;
2341 return;
2342 }
2343 fTimeLimit = limit;
2344 }
2345
2346
2347 //--------------------------------------------------------------------------------
2348 //
2349 // getTimeLimit
2350 //
2351 //--------------------------------------------------------------------------------
getTimeLimit() const2352 int32_t RegexMatcher::getTimeLimit() const {
2353 return fTimeLimit;
2354 }
2355
2356
2357 //--------------------------------------------------------------------------------
2358 //
2359 // setStackLimit
2360 //
2361 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2362 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2363 if (U_FAILURE(status)) {
2364 return;
2365 }
2366 if (U_FAILURE(fDeferredStatus)) {
2367 status = fDeferredStatus;
2368 return;
2369 }
2370 if (limit < 0) {
2371 status = U_ILLEGAL_ARGUMENT_ERROR;
2372 return;
2373 }
2374
2375 // Reset the matcher. This is needed here in case there is a current match
2376 // whose final stack frame (containing the match results, pointed to by fFrame)
2377 // would be lost by resizing to a smaller stack size.
2378 reset();
2379
2380 if (limit == 0) {
2381 // Unlimited stack expansion
2382 fStack->setMaxCapacity(0);
2383 } else {
2384 // Change the units of the limit from bytes to ints, and bump the size up
2385 // to be big enough to hold at least one stack frame for the pattern,
2386 // if it isn't there already.
2387 int32_t adjustedLimit = limit / sizeof(int32_t);
2388 if (adjustedLimit < fPattern->fFrameSize) {
2389 adjustedLimit = fPattern->fFrameSize;
2390 }
2391 fStack->setMaxCapacity(adjustedLimit);
2392 }
2393 fStackLimit = limit;
2394 }
2395
2396
2397 //--------------------------------------------------------------------------------
2398 //
2399 // getStackLimit
2400 //
2401 //--------------------------------------------------------------------------------
getStackLimit() const2402 int32_t RegexMatcher::getStackLimit() const {
2403 return fStackLimit;
2404 }
2405
2406
2407 //--------------------------------------------------------------------------------
2408 //
2409 // setMatchCallback
2410 //
2411 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2412 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2413 const void *context,
2414 UErrorCode &status) {
2415 if (U_FAILURE(status)) {
2416 return;
2417 }
2418 fCallbackFn = callback;
2419 fCallbackContext = context;
2420 }
2421
2422
2423 //--------------------------------------------------------------------------------
2424 //
2425 // getMatchCallback
2426 //
2427 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2428 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2429 const void *&context,
2430 UErrorCode &status) {
2431 if (U_FAILURE(status)) {
2432 return;
2433 }
2434 callback = fCallbackFn;
2435 context = fCallbackContext;
2436 }
2437
2438
2439 //--------------------------------------------------------------------------------
2440 //
2441 // setMatchCallback
2442 //
2443 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2444 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2445 const void *context,
2446 UErrorCode &status) {
2447 if (U_FAILURE(status)) {
2448 return;
2449 }
2450 fFindProgressCallbackFn = callback;
2451 fFindProgressCallbackContext = context;
2452 }
2453
2454
2455 //--------------------------------------------------------------------------------
2456 //
2457 // getMatchCallback
2458 //
2459 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2460 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2461 const void *&context,
2462 UErrorCode &status) {
2463 if (U_FAILURE(status)) {
2464 return;
2465 }
2466 callback = fFindProgressCallbackFn;
2467 context = fFindProgressCallbackContext;
2468 }
2469
2470
2471 //================================================================================
2472 //
2473 // Code following this point in this file is the internal
2474 // Match Engine Implementation.
2475 //
2476 //================================================================================
2477
2478
2479 //--------------------------------------------------------------------------------
2480 //
2481 // resetStack
2482 // Discard any previous contents of the state save stack, and initialize a
2483 // new stack frame to all -1. The -1s are needed for capture group limits,
2484 // where they indicate that a group has not yet matched anything.
2485 //--------------------------------------------------------------------------------
resetStack()2486 REStackFrame *RegexMatcher::resetStack() {
2487 // Discard any previous contents of the state save stack, and initialize a
2488 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2489 // where they indicate that a group has not yet matched anything.
2490 fStack->removeAllElements();
2491
2492 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2493 int32_t i;
2494 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2495 iFrame->fExtra[i] = -1;
2496 }
2497 return iFrame;
2498 }
2499
2500
2501
2502 //--------------------------------------------------------------------------------
2503 //
2504 // isWordBoundary
2505 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2506 // For us,
2507 // If the current char is a combining mark,
2508 // \b is FALSE.
2509 // Else Scan backwards to the first non-combining char.
2510 // We are at a boundary if the this char and the original chars are
2511 // opposite in membership in \w set
2512 //
2513 // parameters: pos - the current position in the input buffer
2514 //
2515 // TODO: double-check edge cases at region boundaries.
2516 //
2517 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2518 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2519 UBool isBoundary = FALSE;
2520 UBool cIsWord = FALSE;
2521
2522 if (pos >= fLookLimit) {
2523 fHitEnd = TRUE;
2524 } else {
2525 // Determine whether char c at current position is a member of the word set of chars.
2526 // If we're off the end of the string, behave as though we're not at a word char.
2527 UTEXT_SETNATIVEINDEX(fInputText, pos);
2528 UChar32 c = UTEXT_CURRENT32(fInputText);
2529 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2530 // Current char is a combining one. Not a boundary.
2531 return FALSE;
2532 }
2533 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2534 }
2535
2536 // Back up until we come to a non-combining char, determine whether
2537 // that char is a word char.
2538 UBool prevCIsWord = FALSE;
2539 for (;;) {
2540 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2541 break;
2542 }
2543 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2544 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2545 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2546 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2547 break;
2548 }
2549 }
2550 isBoundary = cIsWord ^ prevCIsWord;
2551 return isBoundary;
2552 }
2553
isChunkWordBoundary(int32_t pos)2554 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2555 UBool isBoundary = FALSE;
2556 UBool cIsWord = FALSE;
2557
2558 const UChar *inputBuf = fInputText->chunkContents;
2559
2560 if (pos >= fLookLimit) {
2561 fHitEnd = TRUE;
2562 } else {
2563 // Determine whether char c at current position is a member of the word set of chars.
2564 // If we're off the end of the string, behave as though we're not at a word char.
2565 UChar32 c;
2566 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2567 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2568 // Current char is a combining one. Not a boundary.
2569 return FALSE;
2570 }
2571 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2572 }
2573
2574 // Back up until we come to a non-combining char, determine whether
2575 // that char is a word char.
2576 UBool prevCIsWord = FALSE;
2577 for (;;) {
2578 if (pos <= fLookStart) {
2579 break;
2580 }
2581 UChar32 prevChar;
2582 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2583 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2584 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2585 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2586 break;
2587 }
2588 }
2589 isBoundary = cIsWord ^ prevCIsWord;
2590 return isBoundary;
2591 }
2592
2593 //--------------------------------------------------------------------------------
2594 //
2595 // isUWordBoundary
2596 //
2597 // Test for a word boundary using RBBI word break.
2598 //
2599 // parameters: pos - the current position in the input buffer
2600 //
2601 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2602 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2603 UBool returnVal = FALSE;
2604 #if UCONFIG_NO_BREAK_ITERATION==0
2605
2606 // If we haven't yet created a break iterator for this matcher, do it now.
2607 if (fWordBreakItr == NULL) {
2608 fWordBreakItr =
2609 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2610 if (U_FAILURE(fDeferredStatus)) {
2611 return FALSE;
2612 }
2613 fWordBreakItr->setText(fInputText, fDeferredStatus);
2614 }
2615
2616 if (pos >= fLookLimit) {
2617 fHitEnd = TRUE;
2618 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2619 // words are not boundaries. All non-word chars stand by themselves,
2620 // with word boundaries on both sides.
2621 } else {
2622 if (!UTEXT_USES_U16(fInputText)) {
2623 // !!!: Would like a better way to do this!
2624 UErrorCode status = U_ZERO_ERROR;
2625 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2626 }
2627 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2628 }
2629 #endif
2630 return returnVal;
2631 }
2632
2633 //--------------------------------------------------------------------------------
2634 //
2635 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2636 // saves. Increment the "time" counter, and call the
2637 // user callback function if there is one installed.
2638 //
2639 // If the match operation needs to be aborted, either for a time-out
2640 // or because the user callback asked for it, just set an error status.
2641 // The engine will pick that up and stop in its outer loop.
2642 //
2643 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2644 void RegexMatcher::IncrementTime(UErrorCode &status) {
2645 fTickCounter = TIMER_INITIAL_VALUE;
2646 fTime++;
2647 if (fCallbackFn != NULL) {
2648 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2649 status = U_REGEX_STOPPED_BY_CALLER;
2650 return;
2651 }
2652 }
2653 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2654 status = U_REGEX_TIME_OUT;
2655 }
2656 }
2657
2658 //--------------------------------------------------------------------------------
2659 //
2660 // ReportFindProgress This function is called once for each advance in the target
2661 // string from the find() function, and calls the user progress callback
2662 // function if there is one installed.
2663 //
2664 // NOTE:
2665 //
2666 // If the match operation needs to be aborted because the user
2667 // callback asked for it, just set an error status.
2668 // The engine will pick that up and stop in its outer loop.
2669 //
2670 //--------------------------------------------------------------------------------
ReportFindProgress(int64_t matchIndex,UErrorCode & status)2671 UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2672 if (fFindProgressCallbackFn != NULL) {
2673 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2674 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2675 return FALSE;
2676 }
2677 }
2678 return TRUE;
2679 }
2680
2681 //--------------------------------------------------------------------------------
2682 //
2683 // StateSave
2684 // Make a new stack frame, initialized as a copy of the current stack frame.
2685 // Set the pattern index in the original stack frame from the operand value
2686 // in the opcode. Execution of the engine continues with the state in
2687 // the newly created stack frame
2688 //
2689 // Note that reserveBlock() may grow the stack, resulting in the
2690 // whole thing being relocated in memory.
2691 //
2692 // Parameters:
2693 // fp The top frame pointer when called. At return, a new
2694 // fame will be present
2695 // savePatIdx An index into the compiled pattern. Goes into the original
2696 // (not new) frame. If execution ever back-tracks out of the
2697 // new frame, this will be where we continue from in the pattern.
2698 // Return
2699 // The new frame pointer.
2700 //
2701 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2702 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2703 // push storage for a new frame.
2704 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2705 if (newFP == NULL) {
2706 // Failure on attempted stack expansion.
2707 // Stack function set some other error code, change it to a more
2708 // specific one for regular expressions.
2709 status = U_REGEX_STACK_OVERFLOW;
2710 // We need to return a writable stack frame, so just return the
2711 // previous frame. The match operation will stop quickly
2712 // because of the error status, after which the frame will never
2713 // be looked at again.
2714 return fp;
2715 }
2716 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2717
2718 // New stack frame = copy of old top frame.
2719 int64_t *source = (int64_t *)fp;
2720 int64_t *dest = newFP;
2721 for (;;) {
2722 *dest++ = *source++;
2723 if (source == newFP) {
2724 break;
2725 }
2726 }
2727
2728 fTickCounter--;
2729 if (fTickCounter <= 0) {
2730 IncrementTime(status); // Re-initializes fTickCounter
2731 }
2732 fp->fPatIdx = savePatIdx;
2733 return (REStackFrame *)newFP;
2734 }
2735
2736
2737 //--------------------------------------------------------------------------------
2738 //
2739 // MatchAt This is the actual matching engine.
2740 //
2741 // startIdx: begin matching a this index.
2742 // toEnd: if true, match must extend to end of the input region
2743 //
2744 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2745 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2746 UBool isMatch = FALSE; // True if the we have a match.
2747
2748 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2749
2750 int32_t op; // Operation from the compiled pattern, split into
2751 int32_t opType; // the opcode
2752 int32_t opValue; // and the operand value.
2753
2754 #ifdef REGEX_RUN_DEBUG
2755 if (fTraceDebug)
2756 {
2757 printf("MatchAt(startIdx=%ld)\n", startIdx);
2758 printf("Original Pattern: ");
2759 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2760 while (c != U_SENTINEL) {
2761 if (c<32 || c>256) {
2762 c = '.';
2763 }
2764 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2765
2766 c = UTEXT_NEXT32(fPattern->fPattern);
2767 }
2768 printf("\n");
2769 printf("Input String: ");
2770 c = utext_next32From(fInputText, 0);
2771 while (c != U_SENTINEL) {
2772 if (c<32 || c>256) {
2773 c = '.';
2774 }
2775 printf("%c", c);
2776
2777 c = UTEXT_NEXT32(fInputText);
2778 }
2779 printf("\n");
2780 printf("\n");
2781 }
2782 #endif
2783
2784 if (U_FAILURE(status)) {
2785 return;
2786 }
2787
2788 // Cache frequently referenced items from the compiled pattern
2789 //
2790 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2791
2792 const UChar *litText = fPattern->fLiteralText.getBuffer();
2793 UVector *sets = fPattern->fSets;
2794
2795 fFrameSize = fPattern->fFrameSize;
2796 REStackFrame *fp = resetStack();
2797
2798 fp->fPatIdx = 0;
2799 fp->fInputIdx = startIdx;
2800
2801 // Zero out the pattern's static data
2802 int32_t i;
2803 for (i = 0; i<fPattern->fDataSize; i++) {
2804 fData[i] = 0;
2805 }
2806
2807 //
2808 // Main loop for interpreting the compiled pattern.
2809 // One iteration of the loop per pattern operation performed.
2810 //
2811 for (;;) {
2812 #if 0
2813 if (_heapchk() != _HEAPOK) {
2814 fprintf(stderr, "Heap Trouble\n");
2815 }
2816 #endif
2817
2818 op = (int32_t)pat[fp->fPatIdx];
2819 opType = URX_TYPE(op);
2820 opValue = URX_VAL(op);
2821 #ifdef REGEX_RUN_DEBUG
2822 if (fTraceDebug) {
2823 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2824 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
2825 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2826 fPattern->dumpOp(fp->fPatIdx);
2827 }
2828 #endif
2829 fp->fPatIdx++;
2830
2831 switch (opType) {
2832
2833
2834 case URX_NOP:
2835 break;
2836
2837
2838 case URX_BACKTRACK:
2839 // Force a backtrack. In some circumstances, the pattern compiler
2840 // will notice that the pattern can't possibly match anything, and will
2841 // emit one of these at that point.
2842 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2843 break;
2844
2845
2846 case URX_ONECHAR:
2847 if (fp->fInputIdx < fActiveLimit) {
2848 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2849 UChar32 c = UTEXT_NEXT32(fInputText);
2850 if (c == opValue) {
2851 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2852 break;
2853 }
2854 } else {
2855 fHitEnd = TRUE;
2856 }
2857
2858 #ifdef REGEX_SMART_BACKTRACKING
2859 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
2860 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2861 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2862 UBool success = FALSE;
2863 UChar32 c = UTEXT_PREVIOUS32(fInputText);
2864 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
2865 if (c == opValue) {
2866 success = TRUE;
2867 break;
2868 } else if (c == U_SENTINEL) {
2869 break;
2870 }
2871 c = UTEXT_PREVIOUS32(fInputText);
2872 }
2873 if (success) {
2874 fHitEnd = FALSE;
2875 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2876 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2877 if (fp->fInputIdx > backSearchIndex) {
2878 fp = StateSave(fp, fp->fPatIdx, status);
2879 }
2880 fp->fPatIdx++; // Skip the LOOP_C, we just did that
2881 break;
2882 }
2883 }
2884 }
2885 #endif
2886
2887 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2888 break;
2889
2890
2891 case URX_STRING:
2892 {
2893 // Test input against a literal string.
2894 // Strings require two slots in the compiled pattern, one for the
2895 // offset to the string text, and one for the length.
2896 int32_t stringStartIdx = opValue;
2897 int32_t stringLen;
2898
2899 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2900 fp->fPatIdx++;
2901 opType = URX_TYPE(op);
2902 stringLen = URX_VAL(op);
2903 U_ASSERT(opType == URX_STRING_LEN);
2904 U_ASSERT(stringLen >= 2);
2905
2906 const UChar *patternChars = litText+stringStartIdx;
2907 const UChar *patternEnd = patternChars+stringLen;
2908
2909 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2910 UChar32 c;
2911 UBool success = TRUE;
2912
2913 while (patternChars < patternEnd && success) {
2914 c = UTEXT_NEXT32(fInputText);
2915
2916 if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
2917 if (U_IS_BMP(c)) {
2918 success = (*patternChars == c);
2919 patternChars += 1;
2920 } else if (patternChars+1 < patternEnd) {
2921 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
2922 patternChars += 2;
2923 }
2924 } else {
2925 success = FALSE;
2926 fHitEnd = TRUE; // TODO: See ticket 6074
2927 }
2928 }
2929
2930 if (success) {
2931 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2932 } else {
2933 #ifdef REGEX_SMART_BACKTRACKING
2934 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
2935 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2936 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2937 // Reset to last start point
2938 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2939 patternChars = litText+stringStartIdx;
2940
2941 // Search backwards for a possible start
2942 do {
2943 c = UTEXT_PREVIOUS32(fInputText);
2944 if (c == U_SENTINEL) {
2945 break;
2946 } else if ((U_IS_BMP(c) && *patternChars == c) ||
2947 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
2948 success = TRUE;
2949 break;
2950 }
2951 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
2952
2953 // And try again
2954 if (success) {
2955 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2956 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2957 if (fp->fInputIdx > backSearchIndex) {
2958 fp = StateSave(fp, fp->fPatIdx, status);
2959 }
2960 fp->fPatIdx++; // Skip the LOOP_C, we just did that
2961 break;
2962 }
2963 }
2964 }
2965 #endif
2966 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2967 }
2968 }
2969 break;
2970
2971
2972 case URX_STATE_SAVE:
2973 fp = StateSave(fp, opValue, status);
2974 break;
2975
2976
2977 case URX_END:
2978 // The match loop will exit via this path on a successful match,
2979 // when we reach the end of the pattern.
2980 if (toEnd && fp->fInputIdx != fActiveLimit) {
2981 // The pattern matched, but not to the end of input. Try some more.
2982 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2983 break;
2984 }
2985 isMatch = TRUE;
2986 goto breakFromLoop;
2987
2988 // Start and End Capture stack frame variables are laid out out like this:
2989 // fp->fExtra[opValue] - The start of a completed capture group
2990 // opValue+1 - The end of a completed capture group
2991 // opValue+2 - the start of a capture group whose end
2992 // has not yet been reached (and might not ever be).
2993 case URX_START_CAPTURE:
2994 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2995 fp->fExtra[opValue+2] = fp->fInputIdx;
2996 break;
2997
2998
2999 case URX_END_CAPTURE:
3000 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
3001 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
3002 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
3003 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
3004 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
3005 break;
3006
3007
3008 case URX_DOLLAR: // $, test for End of line
3009 // or for position before new line at end of input
3010 {
3011 if (fp->fInputIdx >= fAnchorLimit) {
3012 // We really are at the end of input. Success.
3013 fHitEnd = TRUE;
3014 fRequireEnd = TRUE;
3015 break;
3016 }
3017
3018 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019
3020 // If we are positioned just before a new-line that is located at the
3021 // end of input, succeed.
3022 UChar32 c = UTEXT_NEXT32(fInputText);
3023 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3024 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
3025 // If not in the middle of a CR/LF sequence
3026 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
3027 // At new-line at end of input. Success
3028 fHitEnd = TRUE;
3029 fRequireEnd = TRUE;
3030
3031 break;
3032 }
3033 }
3034 } else {
3035 UChar32 nextC = UTEXT_NEXT32(fInputText);
3036 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3037 fHitEnd = TRUE;
3038 fRequireEnd = TRUE;
3039 break; // At CR/LF at end of input. Success
3040 }
3041 }
3042
3043 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3044 }
3045 break;
3046
3047
3048 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
3049 if (fp->fInputIdx >= fAnchorLimit) {
3050 // Off the end of input. Success.
3051 fHitEnd = TRUE;
3052 fRequireEnd = TRUE;
3053 break;
3054 } else {
3055 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3056 UChar32 c = UTEXT_NEXT32(fInputText);
3057 // Either at the last character of input, or off the end.
3058 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3059 fHitEnd = TRUE;
3060 fRequireEnd = TRUE;
3061 break;
3062 }
3063 }
3064
3065 // Not at end of input. Back-track out.
3066 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3067 break;
3068
3069
3070 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
3071 {
3072 if (fp->fInputIdx >= fAnchorLimit) {
3073 // We really are at the end of input. Success.
3074 fHitEnd = TRUE;
3075 fRequireEnd = TRUE;
3076 break;
3077 }
3078 // If we are positioned just before a new-line, succeed.
3079 // It makes no difference where the new-line is within the input.
3080 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3081 UChar32 c = UTEXT_CURRENT32(fInputText);
3082 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3083 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3084 // In multi-line mode, hitting a new-line just before the end of input does not
3085 // set the hitEnd or requireEnd flags
3086 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3087 break;
3088 }
3089 }
3090 // not at a new line. Fail.
3091 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3092 }
3093 break;
3094
3095
3096 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3097 {
3098 if (fp->fInputIdx >= fAnchorLimit) {
3099 // We really are at the end of input. Success.
3100 fHitEnd = TRUE;
3101 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3102 break; // adding a new-line would not lose the match.
3103 }
3104 // If we are not positioned just before a new-line, the test fails; backtrack out.
3105 // It makes no difference where the new-line is within the input.
3106 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3107 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3108 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3109 }
3110 }
3111 break;
3112
3113
3114 case URX_CARET: // ^, test for start of line
3115 if (fp->fInputIdx != fAnchorStart) {
3116 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3117 }
3118 break;
3119
3120
3121 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3122 {
3123 if (fp->fInputIdx == fAnchorStart) {
3124 // We are at the start input. Success.
3125 break;
3126 }
3127 // Check whether character just before the current pos is a new-line
3128 // unless we are at the end of input
3129 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3130 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3131 if ((fp->fInputIdx < fAnchorLimit) &&
3132 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3133 // It's a new-line. ^ is true. Success.
3134 // TODO: what should be done with positions between a CR and LF?
3135 break;
3136 }
3137 // Not at the start of a line. Fail.
3138 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139 }
3140 break;
3141
3142
3143 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3144 {
3145 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3146 if (fp->fInputIdx <= fAnchorStart) {
3147 // We are at the start input. Success.
3148 break;
3149 }
3150 // Check whether character just before the current pos is a new-line
3151 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3152 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3153 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3154 if (c != 0x0a) {
3155 // Not at the start of a line. Back-track out.
3156 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3157 }
3158 }
3159 break;
3160
3161 case URX_BACKSLASH_B: // Test for word boundaries
3162 {
3163 UBool success = isWordBoundary(fp->fInputIdx);
3164 success ^= (opValue != 0); // flip sense for \B
3165 if (!success) {
3166 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3167 }
3168 }
3169 break;
3170
3171
3172 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3173 {
3174 UBool success = isUWordBoundary(fp->fInputIdx);
3175 success ^= (opValue != 0); // flip sense for \B
3176 if (!success) {
3177 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3178 }
3179 }
3180 break;
3181
3182
3183 case URX_BACKSLASH_D: // Test for decimal digit
3184 {
3185 if (fp->fInputIdx >= fActiveLimit) {
3186 fHitEnd = TRUE;
3187 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3188 break;
3189 }
3190
3191 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3192
3193 UChar32 c = UTEXT_NEXT32(fInputText);
3194 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3195 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3196 success ^= (opValue != 0); // flip sense for \D
3197 if (success) {
3198 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199 } else {
3200 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201 }
3202 }
3203 break;
3204
3205
3206 case URX_BACKSLASH_G: // Test for position at end of previous match
3207 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3208 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3209 }
3210 break;
3211
3212
3213 case URX_BACKSLASH_X:
3214 // Match a Grapheme, as defined by Unicode TR 29.
3215 // Differs slightly from Perl, which consumes combining marks independently
3216 // of context.
3217 {
3218
3219 // Fail if at end of input
3220 if (fp->fInputIdx >= fActiveLimit) {
3221 fHitEnd = TRUE;
3222 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3223 break;
3224 }
3225
3226 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3227
3228 // Examine (and consume) the current char.
3229 // Dispatch into a little state machine, based on the char.
3230 UChar32 c;
3231 c = UTEXT_NEXT32(fInputText);
3232 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3233 UnicodeSet **sets = fPattern->fStaticSets;
3234 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3235 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3236 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3237 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3238 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3239 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3240 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3241 goto GC_Extend;
3242
3243
3244
3245 GC_L:
3246 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3247 c = UTEXT_NEXT32(fInputText);
3248 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3249 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3250 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3251 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3252 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3253 (void)UTEXT_PREVIOUS32(fInputText);
3254 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3255 goto GC_Extend;
3256
3257 GC_V:
3258 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3259 c = UTEXT_NEXT32(fInputText);
3260 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3261 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3262 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3263 (void)UTEXT_PREVIOUS32(fInputText);
3264 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3265 goto GC_Extend;
3266
3267 GC_T:
3268 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3269 c = UTEXT_NEXT32(fInputText);
3270 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3271 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3272 (void)UTEXT_PREVIOUS32(fInputText);
3273 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3274 goto GC_Extend;
3275
3276 GC_Extend:
3277 // Combining characters are consumed here
3278 for (;;) {
3279 if (fp->fInputIdx >= fActiveLimit) {
3280 break;
3281 }
3282 c = UTEXT_CURRENT32(fInputText);
3283 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3284 break;
3285 }
3286 (void)UTEXT_NEXT32(fInputText);
3287 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3288 }
3289 goto GC_Done;
3290
3291 GC_Control:
3292 // Most control chars stand alone (don't combine with combining chars),
3293 // except for that CR/LF sequence is a single grapheme cluster.
3294 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3295 c = UTEXT_NEXT32(fInputText);
3296 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3297 }
3298
3299 GC_Done:
3300 if (fp->fInputIdx >= fActiveLimit) {
3301 fHitEnd = TRUE;
3302 }
3303 break;
3304 }
3305
3306
3307
3308
3309 case URX_BACKSLASH_Z: // Test for end of Input
3310 if (fp->fInputIdx < fAnchorLimit) {
3311 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3312 } else {
3313 fHitEnd = TRUE;
3314 fRequireEnd = TRUE;
3315 }
3316 break;
3317
3318
3319
3320 case URX_STATIC_SETREF:
3321 {
3322 // Test input character against one of the predefined sets
3323 // (Word Characters, for example)
3324 // The high bit of the op value is a flag for the match polarity.
3325 // 0: success if input char is in set.
3326 // 1: success if input char is not in set.
3327 if (fp->fInputIdx >= fActiveLimit) {
3328 fHitEnd = TRUE;
3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3330 break;
3331 }
3332
3333 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3334 opValue &= ~URX_NEG_SET;
3335 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3336
3337 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3338 UChar32 c = UTEXT_NEXT32(fInputText);
3339 if (c < 256) {
3340 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3341 if (s8->contains(c)) {
3342 success = !success;
3343 }
3344 } else {
3345 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3346 if (s->contains(c)) {
3347 success = !success;
3348 }
3349 }
3350 if (success) {
3351 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3352 } else {
3353 // the character wasn't in the set.
3354 #ifdef REGEX_SMART_BACKTRACKING
3355 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3356 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3357 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3358 // Try to find it, backwards
3359 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3360 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
3361 do {
3362 c = UTEXT_PREVIOUS32(fInputText);
3363 if (c == U_SENTINEL) {
3364 break;
3365 } else if (c < 256) {
3366 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3367 if (s8->contains(c)) {
3368 success = !success;
3369 }
3370 } else {
3371 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3372 if (s->contains(c)) {
3373 success = !success;
3374 }
3375 }
3376 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
3377
3378 if (success && c != U_SENTINEL) {
3379 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3380 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3381 if (fp->fInputIdx > backSearchIndex) {
3382 fp = StateSave(fp, fp->fPatIdx, status);
3383 }
3384 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3385 break;
3386 }
3387 }
3388 }
3389 #endif
3390 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3391 }
3392 }
3393 break;
3394
3395
3396 case URX_STAT_SETREF_N:
3397 {
3398 // Test input character for NOT being a member of one of
3399 // the predefined sets (Word Characters, for example)
3400 if (fp->fInputIdx >= fActiveLimit) {
3401 fHitEnd = TRUE;
3402 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3403 break;
3404 }
3405
3406 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3407
3408 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3409
3410 UChar32 c = UTEXT_NEXT32(fInputText);
3411 if (c < 256) {
3412 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3413 if (s8->contains(c) == FALSE) {
3414 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3415 break;
3416 }
3417 } else {
3418 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3419 if (s->contains(c) == FALSE) {
3420 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3421 break;
3422 }
3423 }
3424 // the character wasn't in the set.
3425 #ifdef REGEX_SMART_BACKTRACKING
3426 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3427 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3428 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3429 // Try to find it, backwards
3430 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3431 UBool success = FALSE;
3432 do {
3433 c = UTEXT_PREVIOUS32(fInputText);
3434 if (c == U_SENTINEL) {
3435 break;
3436 } else if (c < 256) {
3437 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3438 if (s8->contains(c) == FALSE) {
3439 success = TRUE;
3440 break;
3441 }
3442 } else {
3443 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3444 if (s->contains(c) == FALSE) {
3445 success = TRUE;
3446 break;
3447 }
3448 }
3449 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3450
3451 if (success) {
3452 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3453 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3454 if (fp->fInputIdx > backSearchIndex) {
3455 fp = StateSave(fp, fp->fPatIdx, status);
3456 }
3457 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3458 break;
3459 }
3460 }
3461 }
3462 #endif
3463 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3464 }
3465 break;
3466
3467
3468 case URX_SETREF:
3469 if (fp->fInputIdx >= fActiveLimit) {
3470 fHitEnd = TRUE;
3471 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3472 break;
3473 } else {
3474 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3475
3476 // There is input left. Pick up one char and test it for set membership.
3477 UChar32 c = UTEXT_NEXT32(fInputText);
3478 U_ASSERT(opValue > 0 && opValue < sets->size());
3479 if (c<256) {
3480 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3481 if (s8->contains(c)) {
3482 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3483 break;
3484 }
3485 } else {
3486 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3487 if (s->contains(c)) {
3488 // The character is in the set. A Match.
3489 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3490 break;
3491 }
3492 }
3493
3494 // the character wasn't in the set.
3495 #ifdef REGEX_SMART_BACKTRACKING
3496 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3497 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3498 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3499 // Try to find it, backwards
3500 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3501 UBool success = FALSE;
3502 do {
3503 c = UTEXT_PREVIOUS32(fInputText);
3504 if (c == U_SENTINEL) {
3505 break;
3506 } else if (c < 256) {
3507 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3508 if (s8->contains(c)) {
3509 success = TRUE;
3510 break;
3511 }
3512 } else {
3513 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3514 if (s->contains(c)) {
3515 success = TRUE;
3516 break;
3517 }
3518 }
3519 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3520
3521 if (success) {
3522 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3523 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3524 if (fp->fInputIdx > backSearchIndex) {
3525 fp = StateSave(fp, fp->fPatIdx, status);
3526 }
3527 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3528 break;
3529 }
3530 }
3531 }
3532 #endif
3533 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3534 }
3535 break;
3536
3537
3538 case URX_DOTANY:
3539 {
3540 // . matches anything, but stops at end-of-line.
3541 if (fp->fInputIdx >= fActiveLimit) {
3542 // At end of input. Match failed. Backtrack out.
3543 fHitEnd = TRUE;
3544 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3545 break;
3546 }
3547
3548 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3549
3550 // There is input left. Advance over one char, unless we've hit end-of-line
3551 UChar32 c = UTEXT_NEXT32(fInputText);
3552 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
3553 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3554 // End of line in normal mode. . does not match.
3555 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3556 break;
3557 }
3558 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3559 }
3560 break;
3561
3562
3563 case URX_DOTANY_ALL:
3564 {
3565 // ., in dot-matches-all (including new lines) mode
3566 if (fp->fInputIdx >= fActiveLimit) {
3567 // At end of input. Match failed. Backtrack out.
3568 fHitEnd = TRUE;
3569 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3570 break;
3571 }
3572
3573 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3574
3575 // There is input left. Advance over one char, except if we are
3576 // at a cr/lf, advance over both of them.
3577 UChar32 c;
3578 c = UTEXT_NEXT32(fInputText);
3579 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3580 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3581 // In the case of a CR/LF, we need to advance over both.
3582 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3583 if (nextc == 0x0a) {
3584 (void)UTEXT_NEXT32(fInputText);
3585 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3586 }
3587 }
3588 }
3589 break;
3590
3591
3592 case URX_DOTANY_UNIX:
3593 {
3594 // '.' operator, matches all, but stops at end-of-line.
3595 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3596 if (fp->fInputIdx >= fActiveLimit) {
3597 // At end of input. Match failed. Backtrack out.
3598 fHitEnd = TRUE;
3599 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3600 break;
3601 }
3602
3603 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3604
3605 // There is input left. Advance over one char, unless we've hit end-of-line
3606 UChar32 c = UTEXT_NEXT32(fInputText);
3607 if (c == 0x0a) {
3608 // End of line in normal mode. '.' does not match the \n
3609 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3610 } else {
3611 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3612 }
3613 }
3614 break;
3615
3616
3617 case URX_JMP:
3618 fp->fPatIdx = opValue;
3619 break;
3620
3621 case URX_FAIL:
3622 isMatch = FALSE;
3623 goto breakFromLoop;
3624
3625 case URX_JMP_SAV:
3626 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3627 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3628 fp->fPatIdx = opValue; // Then JMP.
3629 break;
3630
3631 case URX_JMP_SAV_X:
3632 // This opcode is used with (x)+, when x can match a zero length string.
3633 // Same as JMP_SAV, except conditional on the match having made forward progress.
3634 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3635 // data address of the input position at the start of the loop.
3636 {
3637 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3638 int32_t stoOp = (int32_t)pat[opValue-1];
3639 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3640 int32_t frameLoc = URX_VAL(stoOp);
3641 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3642 int64_t prevInputIdx = fp->fExtra[frameLoc];
3643 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3644 if (prevInputIdx < fp->fInputIdx) {
3645 // The match did make progress. Repeat the loop.
3646 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3647 fp->fPatIdx = opValue;
3648 fp->fExtra[frameLoc] = fp->fInputIdx;
3649 }
3650 // If the input position did not advance, we do nothing here,
3651 // execution will fall out of the loop.
3652 }
3653 break;
3654
3655 case URX_CTR_INIT:
3656 {
3657 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3658 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3659
3660 // Pick up the three extra operands that CTR_INIT has, and
3661 // skip the pattern location counter past
3662 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3663 fp->fPatIdx += 3;
3664 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3665 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3666 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3667 U_ASSERT(minCount>=0);
3668 U_ASSERT(maxCount>=minCount || maxCount==-1);
3669 U_ASSERT(loopLoc>fp->fPatIdx);
3670
3671 if (minCount == 0) {
3672 fp = StateSave(fp, loopLoc+1, status);
3673 }
3674 if (maxCount == 0) {
3675 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3676 }
3677 }
3678 break;
3679
3680 case URX_CTR_LOOP:
3681 {
3682 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3683 int32_t initOp = (int32_t)pat[opValue];
3684 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3685 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3686 int32_t minCount = (int32_t)pat[opValue+2];
3687 int32_t maxCount = (int32_t)pat[opValue+3];
3688 // Increment the counter. Note: we DIDN'T worry about counter
3689 // overflow, since the data comes from UnicodeStrings, which
3690 // stores its length in an int32_t. Do we have to think about
3691 // this now that we're using UText? Probably not, since the length
3692 // in UChar32s is still an int32_t.
3693 (*pCounter)++;
3694 U_ASSERT(*pCounter > 0);
3695 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3696 U_ASSERT(*pCounter == maxCount || maxCount == -1);
3697 break;
3698 }
3699 if (*pCounter >= minCount) {
3700 fp = StateSave(fp, fp->fPatIdx, status);
3701 }
3702 fp->fPatIdx = opValue + 4; // Loop back.
3703 }
3704 break;
3705
3706 case URX_CTR_INIT_NG:
3707 {
3708 // Initialize a non-greedy loop
3709 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3710 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3711
3712 // Pick up the three extra operands that CTR_INIT has, and
3713 // skip the pattern location counter past
3714 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3715 fp->fPatIdx += 3;
3716 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3717 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3718 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3719 U_ASSERT(minCount>=0);
3720 U_ASSERT(maxCount>=minCount || maxCount==-1);
3721 U_ASSERT(loopLoc>fp->fPatIdx);
3722
3723 if (minCount == 0) {
3724 if (maxCount != 0) {
3725 fp = StateSave(fp, fp->fPatIdx, status);
3726 }
3727 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3728 }
3729 }
3730 break;
3731
3732 case URX_CTR_LOOP_NG:
3733 {
3734 // Non-greedy {min, max} loops
3735 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3736 int32_t initOp = (int32_t)pat[opValue];
3737 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3738 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3739 int32_t minCount = (int32_t)pat[opValue+2];
3740 int32_t maxCount = (int32_t)pat[opValue+3];
3741 // Increment the counter. Note: we DIDN'T worry about counter
3742 // overflow, since the data comes from UnicodeStrings, which
3743 // stores its length in an int32_t. Do we have to think about
3744 // this now that we're using UText? Probably not, since the length
3745 // in UChar32s is still an int32_t.
3746 (*pCounter)++;
3747 U_ASSERT(*pCounter > 0);
3748
3749 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3750 // The loop has matched the maximum permitted number of times.
3751 // Break out of here with no action. Matching will
3752 // continue with the following pattern.
3753 U_ASSERT(*pCounter == maxCount || maxCount == -1);
3754 break;
3755 }
3756
3757 if (*pCounter < minCount) {
3758 // We haven't met the minimum number of matches yet.
3759 // Loop back for another one.
3760 fp->fPatIdx = opValue + 4; // Loop back.
3761 } else {
3762 // We do have the minimum number of matches.
3763 // Fall into the following pattern, but first do
3764 // a state save to the top of the loop, so that a failure
3765 // in the following pattern will try another iteration of the loop.
3766 fp = StateSave(fp, opValue + 4, status);
3767 }
3768 }
3769 break;
3770
3771 case URX_STO_SP:
3772 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3773 fData[opValue] = fStack->size();
3774 break;
3775
3776 case URX_LD_SP:
3777 {
3778 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3779 int32_t newStackSize = (int32_t)fData[opValue];
3780 U_ASSERT(newStackSize <= fStack->size());
3781 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3782 if (newFP == (int64_t *)fp) {
3783 break;
3784 }
3785 int32_t i;
3786 for (i=0; i<fFrameSize; i++) {
3787 newFP[i] = ((int64_t *)fp)[i];
3788 }
3789 fp = (REStackFrame *)newFP;
3790 fStack->setSize(newStackSize);
3791 }
3792 break;
3793
3794 case URX_BACKREF:
3795 case URX_BACKREF_I:
3796 {
3797 U_ASSERT(opValue < fFrameSize);
3798 int64_t groupStartIdx = fp->fExtra[opValue];
3799 int64_t groupEndIdx = fp->fExtra[opValue+1];
3800 U_ASSERT(groupStartIdx <= groupEndIdx);
3801 if (groupStartIdx < 0) {
3802 // This capture group has not participated in the match thus far,
3803 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3804 }
3805
3806 if (groupEndIdx == groupStartIdx) {
3807 // The capture group match was of an empty string.
3808 // Verified by testing: Perl matches succeed in this case, so
3809 // we do too.
3810 break;
3811 }
3812
3813 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3814 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3815
3816 UBool haveMatch = (opType == URX_BACKREF ?
3817 (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
3818 (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
3819 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3820
3821 if (fp->fInputIdx > fActiveLimit) {
3822 fHitEnd = TRUE;
3823 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3824 } else if (!haveMatch) {
3825 if (fp->fInputIdx == fActiveLimit) {
3826 fHitEnd = TRUE;
3827 }
3828 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3829 }
3830 }
3831 break;
3832
3833 case URX_STO_INP_LOC:
3834 {
3835 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3836 fp->fExtra[opValue] = fp->fInputIdx;
3837 }
3838 break;
3839
3840 case URX_JMPX:
3841 {
3842 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3843 fp->fPatIdx += 1;
3844 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3845 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3846 int64_t savedInputIdx = fp->fExtra[dataLoc];
3847 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3848 if (savedInputIdx < fp->fInputIdx) {
3849 fp->fPatIdx = opValue; // JMP
3850 } else {
3851 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3852 }
3853 }
3854 break;
3855
3856 case URX_LA_START:
3857 {
3858 // Entering a lookahead block.
3859 // Save Stack Ptr, Input Pos.
3860 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3861 fData[opValue] = fStack->size();
3862 fData[opValue+1] = fp->fInputIdx;
3863 fActiveStart = fLookStart; // Set the match region change for
3864 fActiveLimit = fLookLimit; // transparent bounds.
3865 }
3866 break;
3867
3868 case URX_LA_END:
3869 {
3870 // Leaving a look-ahead block.
3871 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3872 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3873 int32_t stackSize = fStack->size();
3874 int32_t newStackSize =(int32_t)fData[opValue];
3875 U_ASSERT(stackSize >= newStackSize);
3876 if (stackSize > newStackSize) {
3877 // Copy the current top frame back to the new (cut back) top frame.
3878 // This makes the capture groups from within the look-ahead
3879 // expression available.
3880 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3881 int32_t i;
3882 for (i=0; i<fFrameSize; i++) {
3883 newFP[i] = ((int64_t *)fp)[i];
3884 }
3885 fp = (REStackFrame *)newFP;
3886 fStack->setSize(newStackSize);
3887 }
3888 fp->fInputIdx = fData[opValue+1];
3889
3890 // Restore the active region bounds in the input string; they may have
3891 // been changed because of transparent bounds on a Region.
3892 fActiveStart = fRegionStart;
3893 fActiveLimit = fRegionLimit;
3894 }
3895 break;
3896
3897 case URX_ONECHAR_I:
3898 if (fp->fInputIdx < fActiveLimit) {
3899 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3900
3901 UChar32 c = UTEXT_NEXT32(fInputText);
3902 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3903 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3904 break;
3905 }
3906 } else {
3907 fHitEnd = TRUE;
3908 }
3909
3910 #ifdef REGEX_SMART_BACKTRACKING
3911 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3912 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3913 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3914 UBool success = FALSE;
3915 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3916 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
3917 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3918 success = TRUE;
3919 break;
3920 } else if (c == U_SENTINEL) {
3921 break;
3922 }
3923 c = UTEXT_PREVIOUS32(fInputText);
3924 }
3925 if (success) {
3926 fHitEnd = FALSE;
3927 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3928 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3929 if (fp->fInputIdx > backSearchIndex) {
3930 fp = StateSave(fp, fp->fPatIdx, status);
3931 }
3932 fp->fPatIdx++; // Skip the LOOP_C, we just did that
3933 break;
3934 }
3935 }
3936 }
3937 #endif
3938
3939 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3940 break;
3941
3942 case URX_STRING_I:
3943 {
3944 // Test input against a literal string.
3945 // Strings require two slots in the compiled pattern, one for the
3946 // offset to the string text, and one for the length.
3947 const UCaseProps *csp = ucase_getSingleton();
3948 {
3949 int32_t stringStartIdx, stringLen;
3950 stringStartIdx = opValue;
3951
3952 op = (int32_t)pat[fp->fPatIdx];
3953 fp->fPatIdx++;
3954 opType = URX_TYPE(op);
3955 opValue = URX_VAL(op);
3956 U_ASSERT(opType == URX_STRING_LEN);
3957 stringLen = opValue;
3958
3959 const UChar *patternChars = litText+stringStartIdx;
3960 const UChar *patternEnd = patternChars+stringLen;
3961
3962 const UChar *foldChars = NULL;
3963 int32_t foldOffset, foldLength;
3964 UChar32 c;
3965
3966 foldOffset = foldLength = 0;
3967 UBool success = TRUE;
3968
3969 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3970 while (patternChars < patternEnd && success) {
3971 if(foldOffset < foldLength) {
3972 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3973 } else {
3974 c = UTEXT_NEXT32(fInputText);
3975 if (c != U_SENTINEL) {
3976 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3977 if(foldLength >= 0) {
3978 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
3979 foldOffset = 0;
3980 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3981 } else {
3982 c = foldLength;
3983 foldLength = foldOffset; // to avoid reading chars from the folding buffer
3984 }
3985 }
3986 }
3987
3988 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3989 }
3990
3991 success = FALSE;
3992 if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
3993 if (U_IS_BMP(c)) {
3994 success = (*patternChars == c);
3995 patternChars += 1;
3996 } else if (patternChars+1 < patternEnd) {
3997 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
3998 patternChars += 2;
3999 }
4000 } else {
4001 fHitEnd = TRUE; // TODO: See ticket 6074
4002 }
4003 }
4004
4005 if (!success) {
4006 #ifdef REGEX_SMART_BACKTRACKING
4007 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4008 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4009 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4010 // Reset to last start point
4011 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4012 patternChars = litText+stringStartIdx;
4013
4014 // Search backwards for a possible start
4015 do {
4016 c = UTEXT_PREVIOUS32(fInputText);
4017 if (c == U_SENTINEL) {
4018 break;
4019 } else {
4020 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
4021 if(foldLength >= 0) {
4022 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
4023 foldOffset = 0;
4024 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
4025 } else {
4026 c = foldLength;
4027 foldLength = foldOffset; // to avoid reading chars from the folding buffer
4028 }
4029 }
4030
4031 if ((U_IS_BMP(c) && *patternChars == c) ||
4032 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
4033 success = TRUE;
4034 break;
4035 }
4036 }
4037 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
4038
4039 // And try again
4040 if (success) {
4041 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4042 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4043 if (fp->fInputIdx > backSearchIndex) {
4044 fp = StateSave(fp, fp->fPatIdx, status);
4045 }
4046 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4047 break;
4048 }
4049 }
4050 }
4051 #endif
4052 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4053 }
4054 }
4055 }
4056 break;
4057
4058 case URX_LB_START:
4059 {
4060 // Entering a look-behind block.
4061 // Save Stack Ptr, Input Pos.
4062 // TODO: implement transparent bounds. Ticket #6067
4063 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4064 fData[opValue] = fStack->size();
4065 fData[opValue+1] = fp->fInputIdx;
4066 // Init the variable containing the start index for attempted matches.
4067 fData[opValue+2] = -1;
4068 // Save input string length, then reset to pin any matches to end at
4069 // the current position.
4070 fData[opValue+3] = fActiveLimit;
4071 fActiveLimit = fp->fInputIdx;
4072 }
4073 break;
4074
4075
4076 case URX_LB_CONT:
4077 {
4078 // Positive Look-Behind, at top of loop checking for matches of LB expression
4079 // at all possible input starting positions.
4080
4081 // Fetch the min and max possible match lengths. They are the operands
4082 // of this op in the pattern.
4083 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4084 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4085 U_ASSERT(minML <= maxML);
4086 U_ASSERT(minML >= 0);
4087
4088 // Fetch (from data) the last input index where a match was attempted.
4089 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4090 int64_t *lbStartIdx = &fData[opValue+2];
4091 if (*lbStartIdx < 0) {
4092 // First time through loop.
4093 *lbStartIdx = fp->fInputIdx - minML;
4094 } else {
4095 // 2nd through nth time through the loop.
4096 // Back up start position for match by one.
4097 if (*lbStartIdx == 0) {
4098 (*lbStartIdx)--;
4099 } else {
4100 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4101 (void)UTEXT_PREVIOUS32(fInputText);
4102 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4103 }
4104 }
4105
4106 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4107 // We have tried all potential match starting points without
4108 // getting a match. Backtrack out, and out of the
4109 // Look Behind altogether.
4110 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4111 int64_t restoreInputLen = fData[opValue+3];
4112 U_ASSERT(restoreInputLen >= fActiveLimit);
4113 U_ASSERT(restoreInputLen <= fInputLength);
4114 fActiveLimit = restoreInputLen;
4115 break;
4116 }
4117
4118 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4119 // (successful match will fall off the end of the loop.)
4120 fp = StateSave(fp, fp->fPatIdx-3, status);
4121 fp->fInputIdx = *lbStartIdx;
4122 }
4123 break;
4124
4125 case URX_LB_END:
4126 // End of a look-behind block, after a successful match.
4127 {
4128 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4129 if (fp->fInputIdx != fActiveLimit) {
4130 // The look-behind expression matched, but the match did not
4131 // extend all the way to the point that we are looking behind from.
4132 // FAIL out of here, which will take us back to the LB_CONT, which
4133 // will retry the match starting at another position or fail
4134 // the look-behind altogether, whichever is appropriate.
4135 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4136 break;
4137 }
4138
4139 // Look-behind match is good. Restore the orignal input string length,
4140 // which had been truncated to pin the end of the lookbehind match to the
4141 // position being looked-behind.
4142 int64_t originalInputLen = fData[opValue+3];
4143 U_ASSERT(originalInputLen >= fActiveLimit);
4144 U_ASSERT(originalInputLen <= fInputLength);
4145 fActiveLimit = originalInputLen;
4146 }
4147 break;
4148
4149
4150 case URX_LBN_CONT:
4151 {
4152 // Negative Look-Behind, at top of loop checking for matches of LB expression
4153 // at all possible input starting positions.
4154
4155 // Fetch the extra parameters of this op.
4156 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4157 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4158 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4159 continueLoc = URX_VAL(continueLoc);
4160 U_ASSERT(minML <= maxML);
4161 U_ASSERT(minML >= 0);
4162 U_ASSERT(continueLoc > fp->fPatIdx);
4163
4164 // Fetch (from data) the last input index where a match was attempted.
4165 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4166 int64_t *lbStartIdx = &fData[opValue+2];
4167 if (*lbStartIdx < 0) {
4168 // First time through loop.
4169 *lbStartIdx = fp->fInputIdx - minML;
4170 } else {
4171 // 2nd through nth time through the loop.
4172 // Back up start position for match by one.
4173 if (*lbStartIdx == 0) {
4174 (*lbStartIdx)--;
4175 } else {
4176 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4177 (void)UTEXT_PREVIOUS32(fInputText);
4178 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4179 }
4180 }
4181
4182 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4183 // We have tried all potential match starting points without
4184 // getting a match, which means that the negative lookbehind as
4185 // a whole has succeeded. Jump forward to the continue location
4186 int64_t restoreInputLen = fData[opValue+3];
4187 U_ASSERT(restoreInputLen >= fActiveLimit);
4188 U_ASSERT(restoreInputLen <= fInputLength);
4189 fActiveLimit = restoreInputLen;
4190 fp->fPatIdx = continueLoc;
4191 break;
4192 }
4193
4194 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4195 // (successful match will cause a FAIL out of the loop altogether.)
4196 fp = StateSave(fp, fp->fPatIdx-4, status);
4197 fp->fInputIdx = *lbStartIdx;
4198 }
4199 break;
4200
4201 case URX_LBN_END:
4202 // End of a negative look-behind block, after a successful match.
4203 {
4204 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4205 if (fp->fInputIdx != fActiveLimit) {
4206 // The look-behind expression matched, but the match did not
4207 // extend all the way to the point that we are looking behind from.
4208 // FAIL out of here, which will take us back to the LB_CONT, which
4209 // will retry the match starting at another position or succeed
4210 // the look-behind altogether, whichever is appropriate.
4211 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4212 break;
4213 }
4214
4215 // Look-behind expression matched, which means look-behind test as
4216 // a whole Fails
4217
4218 // Restore the orignal input string length, which had been truncated
4219 // inorder to pin the end of the lookbehind match
4220 // to the position being looked-behind.
4221 int64_t originalInputLen = fData[opValue+3];
4222 U_ASSERT(originalInputLen >= fActiveLimit);
4223 U_ASSERT(originalInputLen <= fInputLength);
4224 fActiveLimit = originalInputLen;
4225
4226 // Restore original stack position, discarding any state saved
4227 // by the successful pattern match.
4228 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4229 int32_t newStackSize = (int32_t)fData[opValue];
4230 U_ASSERT(fStack->size() > newStackSize);
4231 fStack->setSize(newStackSize);
4232
4233 // FAIL, which will take control back to someplace
4234 // prior to entering the look-behind test.
4235 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4236 }
4237 break;
4238
4239
4240 case URX_LOOP_SR_I:
4241 // Loop Initialization for the optimized implementation of
4242 // [some character set]*
4243 // This op scans through all matching input.
4244 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4245 {
4246 U_ASSERT(opValue > 0 && opValue < sets->size());
4247 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4248 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4249
4250 // Loop through input, until either the input is exhausted or
4251 // we reach a character that is not a member of the set.
4252 int64_t ix = fp->fInputIdx;
4253 UTEXT_SETNATIVEINDEX(fInputText, ix);
4254 for (;;) {
4255 if (ix >= fActiveLimit) {
4256 fHitEnd = TRUE;
4257 break;
4258 }
4259 UChar32 c = UTEXT_NEXT32(fInputText);
4260 if (c<256) {
4261 if (s8->contains(c) == FALSE) {
4262 break;
4263 }
4264 } else {
4265 if (s->contains(c) == FALSE) {
4266 break;
4267 }
4268 }
4269 ix = UTEXT_GETNATIVEINDEX(fInputText);
4270 }
4271
4272 // If there were no matching characters, skip over the loop altogether.
4273 // The loop doesn't run at all, a * op always succeeds.
4274 if (ix == fp->fInputIdx) {
4275 fp->fPatIdx++; // skip the URX_LOOP_C op.
4276 break;
4277 }
4278
4279 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4280 // must follow. It's operand is the stack location
4281 // that holds the starting input index for the match of this [set]*
4282 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4283 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4284 int32_t stackLoc = URX_VAL(loopcOp);
4285 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4286 fp->fExtra[stackLoc] = fp->fInputIdx;
4287 #ifdef REGEX_SMART_BACKTRACKING
4288 backSearchIndex = fp->fInputIdx;
4289 #endif
4290 fp->fInputIdx = ix;
4291
4292 // Save State to the URX_LOOP_C op that follows this one,
4293 // so that match failures in the following code will return to there.
4294 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4295 fp = StateSave(fp, fp->fPatIdx, status);
4296 fp->fPatIdx++;
4297 }
4298 break;
4299
4300
4301 case URX_LOOP_DOT_I:
4302 // Loop Initialization for the optimized implementation of .*
4303 // This op scans through all remaining input.
4304 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4305 {
4306 // Loop through input until the input is exhausted (we reach an end-of-line)
4307 // In DOTALL mode, we can just go straight to the end of the input.
4308 int64_t ix;
4309 if ((opValue & 1) == 1) {
4310 // Dot-matches-All mode. Jump straight to the end of the string.
4311 ix = fActiveLimit;
4312 fHitEnd = TRUE;
4313 } else {
4314 // NOT DOT ALL mode. Line endings do not match '.'
4315 // Scan forward until a line ending or end of input.
4316 ix = fp->fInputIdx;
4317 UTEXT_SETNATIVEINDEX(fInputText, ix);
4318 for (;;) {
4319 if (ix >= fActiveLimit) {
4320 fHitEnd = TRUE;
4321 break;
4322 }
4323 UChar32 c = UTEXT_NEXT32(fInputText);
4324 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4325 if ((c == 0x0a) || // 0x0a is newline in both modes.
4326 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4327 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4328 // char is a line ending. Exit the scanning loop.
4329 break;
4330 }
4331 }
4332 ix = UTEXT_GETNATIVEINDEX(fInputText);
4333 }
4334 }
4335
4336 // If there were no matching characters, skip over the loop altogether.
4337 // The loop doesn't run at all, a * op always succeeds.
4338 if (ix == fp->fInputIdx) {
4339 fp->fPatIdx++; // skip the URX_LOOP_C op.
4340 break;
4341 }
4342
4343 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4344 // must follow. It's operand is the stack location
4345 // that holds the starting input index for the match of this .*
4346 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4347 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4348 int32_t stackLoc = URX_VAL(loopcOp);
4349 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4350 fp->fExtra[stackLoc] = fp->fInputIdx;
4351 #ifdef REGEX_SMART_BACKTRACKING
4352 backSearchIndex = fp->fInputIdx;
4353 #endif
4354 fp->fInputIdx = ix;
4355
4356 // Save State to the URX_LOOP_C op that follows this one,
4357 // so that match failures in the following code will return to there.
4358 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4359 fp = StateSave(fp, fp->fPatIdx, status);
4360 fp->fPatIdx++;
4361 }
4362 break;
4363
4364
4365 case URX_LOOP_C:
4366 {
4367 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4368 backSearchIndex = fp->fExtra[opValue];
4369 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4370 if (backSearchIndex == fp->fInputIdx) {
4371 // We've backed up the input idx to the point that the loop started.
4372 // The loop is done. Leave here without saving state.
4373 // Subsequent failures won't come back here.
4374 break;
4375 }
4376 // Set up for the next iteration of the loop, with input index
4377 // backed up by one from the last time through,
4378 // and a state save to this instruction in case the following code fails again.
4379 // (We're going backwards because this loop emulates stack unwinding, not
4380 // the initial scan forward.)
4381 U_ASSERT(fp->fInputIdx > 0);
4382 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4383 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4384 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4385
4386 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4387 if (prevC == 0x0a &&
4388 fp->fInputIdx > backSearchIndex &&
4389 twoPrevC == 0x0d) {
4390 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4391 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4392 // .*, stepping back over CRLF pair.
4393 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4394 }
4395 }
4396
4397
4398 fp = StateSave(fp, fp->fPatIdx-1, status);
4399 }
4400 break;
4401
4402
4403
4404 default:
4405 // Trouble. The compiled pattern contains an entry with an
4406 // unrecognized type tag.
4407 U_ASSERT(FALSE);
4408 }
4409
4410 if (U_FAILURE(status)) {
4411 isMatch = FALSE;
4412 break;
4413 }
4414 }
4415
4416 breakFromLoop:
4417 fMatch = isMatch;
4418 if (isMatch) {
4419 fLastMatchEnd = fMatchEnd;
4420 fMatchStart = startIdx;
4421 fMatchEnd = fp->fInputIdx;
4422 if (fTraceDebug) {
4423 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
4424 }
4425 }
4426 else
4427 {
4428 if (fTraceDebug) {
4429 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4430 }
4431 }
4432
4433 fFrame = fp; // The active stack frame when the engine stopped.
4434 // Contains the capture group results that we need to
4435 // access later.
4436 return;
4437 }
4438
4439
4440 //--------------------------------------------------------------------------------
4441 //
4442 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4443 // assumption that the entire string is available in the UText's
4444 // chunk buffer. For now, that means we can use int32_t indexes,
4445 // except for anything that needs to be saved (like group starts
4446 // and ends).
4447 //
4448 // startIdx: begin matching a this index.
4449 // toEnd: if true, match must extend to end of the input region
4450 //
4451 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4452 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4453 UBool isMatch = FALSE; // True if the we have a match.
4454
4455 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4456
4457 int32_t op; // Operation from the compiled pattern, split into
4458 int32_t opType; // the opcode
4459 int32_t opValue; // and the operand value.
4460
4461 #ifdef REGEX_RUN_DEBUG
4462 if (fTraceDebug)
4463 {
4464 printf("MatchAt(startIdx=%ld)\n", startIdx);
4465 printf("Original Pattern: ");
4466 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4467 while (c != U_SENTINEL) {
4468 if (c<32 || c>256) {
4469 c = '.';
4470 }
4471 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4472
4473 c = UTEXT_NEXT32(fPattern->fPattern);
4474 }
4475 printf("\n");
4476 printf("Input String: ");
4477 c = utext_next32From(fInputText, 0);
4478 while (c != U_SENTINEL) {
4479 if (c<32 || c>256) {
4480 c = '.';
4481 }
4482 printf("%c", c);
4483
4484 c = UTEXT_NEXT32(fInputText);
4485 }
4486 printf("\n");
4487 printf("\n");
4488 }
4489 #endif
4490
4491 if (U_FAILURE(status)) {
4492 return;
4493 }
4494
4495 // Cache frequently referenced items from the compiled pattern
4496 //
4497 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4498
4499 const UChar *litText = fPattern->fLiteralText.getBuffer();
4500 UVector *sets = fPattern->fSets;
4501
4502 const UChar *inputBuf = fInputText->chunkContents;
4503
4504 fFrameSize = fPattern->fFrameSize;
4505 REStackFrame *fp = resetStack();
4506
4507 fp->fPatIdx = 0;
4508 fp->fInputIdx = startIdx;
4509
4510 // Zero out the pattern's static data
4511 int32_t i;
4512 for (i = 0; i<fPattern->fDataSize; i++) {
4513 fData[i] = 0;
4514 }
4515
4516 //
4517 // Main loop for interpreting the compiled pattern.
4518 // One iteration of the loop per pattern operation performed.
4519 //
4520 for (;;) {
4521 #if 0
4522 if (_heapchk() != _HEAPOK) {
4523 fprintf(stderr, "Heap Trouble\n");
4524 }
4525 #endif
4526
4527 op = (int32_t)pat[fp->fPatIdx];
4528 opType = URX_TYPE(op);
4529 opValue = URX_VAL(op);
4530 #ifdef REGEX_RUN_DEBUG
4531 if (fTraceDebug) {
4532 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4533 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx,
4534 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4535 fPattern->dumpOp(fp->fPatIdx);
4536 }
4537 #endif
4538 fp->fPatIdx++;
4539
4540 switch (opType) {
4541
4542
4543 case URX_NOP:
4544 break;
4545
4546
4547 case URX_BACKTRACK:
4548 // Force a backtrack. In some circumstances, the pattern compiler
4549 // will notice that the pattern can't possibly match anything, and will
4550 // emit one of these at that point.
4551 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552 break;
4553
4554
4555 case URX_ONECHAR:
4556 if (fp->fInputIdx < fActiveLimit) {
4557 UChar32 c;
4558 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4559 if (c == opValue) {
4560 break;
4561 }
4562 } else {
4563 fHitEnd = TRUE;
4564 }
4565
4566 #ifdef REGEX_SMART_BACKTRACKING
4567 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
4568 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4569 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4570 int64_t reverseIndex = fp->fInputIdx;
4571 UChar32 c;
4572 do {
4573 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4574 if (c == opValue) {
4575 break;
4576 }
4577 } while (reverseIndex > backSearchIndex);
4578 if (c == opValue) {
4579 fHitEnd = FALSE;
4580 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4581 fp->fInputIdx = reverseIndex;
4582 if (fp->fInputIdx > backSearchIndex) {
4583 fp = StateSave(fp, fp->fPatIdx, status);
4584 }
4585 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4586 break;
4587 }
4588 }
4589 }
4590 #endif
4591
4592 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4593 break;
4594
4595
4596 case URX_STRING:
4597 {
4598 // Test input against a literal string.
4599 // Strings require two slots in the compiled pattern, one for the
4600 // offset to the string text, and one for the length.
4601 int32_t stringStartIdx = opValue;
4602 int32_t stringLen;
4603
4604 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4605 fp->fPatIdx++;
4606 opType = URX_TYPE(op);
4607 stringLen = URX_VAL(op);
4608 U_ASSERT(opType == URX_STRING_LEN);
4609 U_ASSERT(stringLen >= 2);
4610
4611 if (fp->fInputIdx + stringLen > fActiveLimit) {
4612 // No match. String is longer than the remaining input text.
4613 fHitEnd = TRUE; // TODO: See ticket 6074
4614 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615 break;
4616 }
4617
4618 const UChar * pInp = inputBuf + fp->fInputIdx;
4619 const UChar * pPat = litText+stringStartIdx;
4620 const UChar * pEnd = pInp + stringLen;
4621 UBool success = FALSE;
4622 for(;;) {
4623 if (*pInp == *pPat) {
4624 pInp++;
4625 pPat++;
4626 if (pInp == pEnd) {
4627 // Successful Match.
4628 success = TRUE;
4629 break;
4630 }
4631 } else {
4632 // Match failed.
4633 break;
4634 }
4635 }
4636
4637 if (success) {
4638 fp->fInputIdx += stringLen;
4639 } else {
4640 #ifdef REGEX_SMART_BACKTRACKING
4641 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4642 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4643 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4644 // Reset to last start point
4645 int64_t reverseIndex = fp->fInputIdx;
4646 UChar32 c;
4647 pPat = litText+stringStartIdx;
4648
4649 // Search backwards for a possible start
4650 do {
4651 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4652 if ((U_IS_BMP(c) && *pPat == c) ||
4653 (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
4654 success = TRUE;
4655 break;
4656 }
4657 } while (reverseIndex > backSearchIndex);
4658
4659 // And try again
4660 if (success) {
4661 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4662 fp->fInputIdx = reverseIndex;
4663 if (fp->fInputIdx > backSearchIndex) {
4664 fp = StateSave(fp, fp->fPatIdx, status);
4665 }
4666 fp->fPatIdx++; // Skip the LOOP_C, we just did that
4667 break;
4668 }
4669 }
4670 }
4671 #endif
4672 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4673 }
4674 }
4675 break;
4676
4677
4678 case URX_STATE_SAVE:
4679 fp = StateSave(fp, opValue, status);
4680 break;
4681
4682
4683 case URX_END:
4684 // The match loop will exit via this path on a successful match,
4685 // when we reach the end of the pattern.
4686 if (toEnd && fp->fInputIdx != fActiveLimit) {
4687 // The pattern matched, but not to the end of input. Try some more.
4688 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4689 break;
4690 }
4691 isMatch = TRUE;
4692 goto breakFromLoop;
4693
4694 // Start and End Capture stack frame variables are laid out out like this:
4695 // fp->fExtra[opValue] - The start of a completed capture group
4696 // opValue+1 - The end of a completed capture group
4697 // opValue+2 - the start of a capture group whose end
4698 // has not yet been reached (and might not ever be).
4699 case URX_START_CAPTURE:
4700 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4701 fp->fExtra[opValue+2] = fp->fInputIdx;
4702 break;
4703
4704
4705 case URX_END_CAPTURE:
4706 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4707 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4708 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4709 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4710 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4711 break;
4712
4713
4714 case URX_DOLLAR: // $, test for End of line
4715 // or for position before new line at end of input
4716 if (fp->fInputIdx < fAnchorLimit-2) {
4717 // We are no where near the end of input. Fail.
4718 // This is the common case. Keep it first.
4719 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4720 break;
4721 }
4722 if (fp->fInputIdx >= fAnchorLimit) {
4723 // We really are at the end of input. Success.
4724 fHitEnd = TRUE;
4725 fRequireEnd = TRUE;
4726 break;
4727 }
4728
4729 // If we are positioned just before a new-line that is located at the
4730 // end of input, succeed.
4731 if (fp->fInputIdx == fAnchorLimit-1) {
4732 UChar32 c;
4733 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4734
4735 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4736 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4737 // At new-line at end of input. Success
4738 fHitEnd = TRUE;
4739 fRequireEnd = TRUE;
4740 break;
4741 }
4742 }
4743 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4744 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4745 fHitEnd = TRUE;
4746 fRequireEnd = TRUE;
4747 break; // At CR/LF at end of input. Success
4748 }
4749
4750 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4751
4752 break;
4753
4754
4755 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4756 if (fp->fInputIdx >= fAnchorLimit-1) {
4757 // Either at the last character of input, or off the end.
4758 if (fp->fInputIdx == fAnchorLimit-1) {
4759 // At last char of input. Success if it's a new line.
4760 if (inputBuf[fp->fInputIdx] == 0x0a) {
4761 fHitEnd = TRUE;
4762 fRequireEnd = TRUE;
4763 break;
4764 }
4765 } else {
4766 // Off the end of input. Success.
4767 fHitEnd = TRUE;
4768 fRequireEnd = TRUE;
4769 break;
4770 }
4771 }
4772
4773 // Not at end of input. Back-track out.
4774 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4775 break;
4776
4777
4778 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4779 {
4780 if (fp->fInputIdx >= fAnchorLimit) {
4781 // We really are at the end of input. Success.
4782 fHitEnd = TRUE;
4783 fRequireEnd = TRUE;
4784 break;
4785 }
4786 // If we are positioned just before a new-line, succeed.
4787 // It makes no difference where the new-line is within the input.
4788 UChar32 c = inputBuf[fp->fInputIdx];
4789 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4790 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4791 // In multi-line mode, hitting a new-line just before the end of input does not
4792 // set the hitEnd or requireEnd flags
4793 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4794 break;
4795 }
4796 }
4797 // not at a new line. Fail.
4798 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4799 }
4800 break;
4801
4802
4803 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4804 {
4805 if (fp->fInputIdx >= fAnchorLimit) {
4806 // We really are at the end of input. Success.
4807 fHitEnd = TRUE;
4808 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4809 break; // adding a new-line would not lose the match.
4810 }
4811 // If we are not positioned just before a new-line, the test fails; backtrack out.
4812 // It makes no difference where the new-line is within the input.
4813 if (inputBuf[fp->fInputIdx] != 0x0a) {
4814 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4815 }
4816 }
4817 break;
4818
4819
4820 case URX_CARET: // ^, test for start of line
4821 if (fp->fInputIdx != fAnchorStart) {
4822 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4823 }
4824 break;
4825
4826
4827 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4828 {
4829 if (fp->fInputIdx == fAnchorStart) {
4830 // We are at the start input. Success.
4831 break;
4832 }
4833 // Check whether character just before the current pos is a new-line
4834 // unless we are at the end of input
4835 UChar c = inputBuf[fp->fInputIdx - 1];
4836 if ((fp->fInputIdx < fAnchorLimit) &&
4837 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4838 // It's a new-line. ^ is true. Success.
4839 // TODO: what should be done with positions between a CR and LF?
4840 break;
4841 }
4842 // Not at the start of a line. Fail.
4843 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4844 }
4845 break;
4846
4847
4848 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4849 {
4850 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4851 if (fp->fInputIdx <= fAnchorStart) {
4852 // We are at the start input. Success.
4853 break;
4854 }
4855 // Check whether character just before the current pos is a new-line
4856 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4857 UChar c = inputBuf[fp->fInputIdx - 1];
4858 if (c != 0x0a) {
4859 // Not at the start of a line. Back-track out.
4860 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4861 }
4862 }
4863 break;
4864
4865 case URX_BACKSLASH_B: // Test for word boundaries
4866 {
4867 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4868 success ^= (opValue != 0); // flip sense for \B
4869 if (!success) {
4870 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4871 }
4872 }
4873 break;
4874
4875
4876 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4877 {
4878 UBool success = isUWordBoundary(fp->fInputIdx);
4879 success ^= (opValue != 0); // flip sense for \B
4880 if (!success) {
4881 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4882 }
4883 }
4884 break;
4885
4886
4887 case URX_BACKSLASH_D: // Test for decimal digit
4888 {
4889 if (fp->fInputIdx >= fActiveLimit) {
4890 fHitEnd = TRUE;
4891 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4892 break;
4893 }
4894
4895 UChar32 c;
4896 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4897 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4898 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4899 success ^= (opValue != 0); // flip sense for \D
4900 if (!success) {
4901 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4902 }
4903 }
4904 break;
4905
4906
4907 case URX_BACKSLASH_G: // Test for position at end of previous match
4908 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4909 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4910 }
4911 break;
4912
4913
4914 case URX_BACKSLASH_X:
4915 // Match a Grapheme, as defined by Unicode TR 29.
4916 // Differs slightly from Perl, which consumes combining marks independently
4917 // of context.
4918 {
4919
4920 // Fail if at end of input
4921 if (fp->fInputIdx >= fActiveLimit) {
4922 fHitEnd = TRUE;
4923 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4924 break;
4925 }
4926
4927 // Examine (and consume) the current char.
4928 // Dispatch into a little state machine, based on the char.
4929 UChar32 c;
4930 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4931 UnicodeSet **sets = fPattern->fStaticSets;
4932 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4933 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4934 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4935 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4936 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4937 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4938 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4939 goto GC_Extend;
4940
4941
4942
4943 GC_L:
4944 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4945 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4946 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4947 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4948 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4949 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4950 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4951 goto GC_Extend;
4952
4953 GC_V:
4954 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4955 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4956 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4957 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4958 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4959 goto GC_Extend;
4960
4961 GC_T:
4962 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4963 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4964 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4965 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4966 goto GC_Extend;
4967
4968 GC_Extend:
4969 // Combining characters are consumed here
4970 for (;;) {
4971 if (fp->fInputIdx >= fActiveLimit) {
4972 break;
4973 }
4974 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4975 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4976 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4977 break;
4978 }
4979 }
4980 goto GC_Done;
4981
4982 GC_Control:
4983 // Most control chars stand alone (don't combine with combining chars),
4984 // except for that CR/LF sequence is a single grapheme cluster.
4985 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4986 fp->fInputIdx++;
4987 }
4988
4989 GC_Done:
4990 if (fp->fInputIdx >= fActiveLimit) {
4991 fHitEnd = TRUE;
4992 }
4993 break;
4994 }
4995
4996
4997
4998
4999 case URX_BACKSLASH_Z: // Test for end of Input
5000 if (fp->fInputIdx < fAnchorLimit) {
5001 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5002 } else {
5003 fHitEnd = TRUE;
5004 fRequireEnd = TRUE;
5005 }
5006 break;
5007
5008
5009
5010 case URX_STATIC_SETREF:
5011 {
5012 // Test input character against one of the predefined sets
5013 // (Word Characters, for example)
5014 // The high bit of the op value is a flag for the match polarity.
5015 // 0: success if input char is in set.
5016 // 1: success if input char is not in set.
5017 if (fp->fInputIdx >= fActiveLimit) {
5018 fHitEnd = TRUE;
5019 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5020 break;
5021 }
5022
5023 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
5024 opValue &= ~URX_NEG_SET;
5025 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5026
5027 UChar32 c;
5028 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5029 if (c < 256) {
5030 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5031 if (s8->contains(c)) {
5032 success = !success;
5033 }
5034 } else {
5035 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5036 if (s->contains(c)) {
5037 success = !success;
5038 }
5039 }
5040 if (!success) {
5041 #ifdef REGEX_SMART_BACKTRACKING
5042 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5043 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5044 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5045 // Try to find it, backwards
5046 int64_t reverseIndex = fp->fInputIdx;
5047 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5048 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
5049 do {
5050 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5051 if (c < 256) {
5052 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5053 if (s8->contains(c)) {
5054 success = !success;
5055 }
5056 } else {
5057 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5058 if (s->contains(c)) {
5059 success = !success;
5060 }
5061 }
5062 } while (reverseIndex > backSearchIndex && !success);
5063
5064 if (success) {
5065 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5066 fp->fInputIdx = reverseIndex;
5067 if (fp->fInputIdx > backSearchIndex) {
5068 fp = StateSave(fp, fp->fPatIdx, status);
5069 }
5070 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5071 break;
5072 }
5073 }
5074 }
5075 #endif
5076 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5077 }
5078 }
5079 break;
5080
5081
5082 case URX_STAT_SETREF_N:
5083 {
5084 // Test input character for NOT being a member of one of
5085 // the predefined sets (Word Characters, for example)
5086 if (fp->fInputIdx >= fActiveLimit) {
5087 fHitEnd = TRUE;
5088 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5089 break;
5090 }
5091
5092 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5093
5094 UChar32 c;
5095 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5096 if (c < 256) {
5097 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5098 if (s8->contains(c) == FALSE) {
5099 break;
5100 }
5101 } else {
5102 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5103 if (s->contains(c) == FALSE) {
5104 break;
5105 }
5106 }
5107
5108 #ifdef REGEX_SMART_BACKTRACKING
5109 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5110 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5111 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5112 // Try to find it, backwards
5113 int64_t reverseIndex = fp->fInputIdx;
5114 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5115 UBool success = FALSE;
5116 do {
5117 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5118 if (c < 256) {
5119 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5120 if (s8->contains(c) == FALSE) {
5121 success = TRUE;
5122 break;
5123 }
5124 } else {
5125 const UnicodeSet *s = fPattern->fStaticSets[opValue];
5126 if (s->contains(c) == FALSE) {
5127 success = TRUE;
5128 break;
5129 }
5130 }
5131 } while (reverseIndex > backSearchIndex);
5132
5133 if (success) {
5134 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5135 fp->fInputIdx = reverseIndex;
5136 if (fp->fInputIdx > backSearchIndex) {
5137 fp = StateSave(fp, fp->fPatIdx, status);
5138 }
5139 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5140 break;
5141 }
5142 }
5143 }
5144 #endif
5145 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5146 }
5147 break;
5148
5149
5150 case URX_SETREF:
5151 {
5152 if (fp->fInputIdx >= fActiveLimit) {
5153 fHitEnd = TRUE;
5154 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5155 break;
5156 }
5157
5158 U_ASSERT(opValue > 0 && opValue < sets->size());
5159
5160 // There is input left. Pick up one char and test it for set membership.
5161 UChar32 c;
5162 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5163 if (c<256) {
5164 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5165 if (s8->contains(c)) {
5166 // The character is in the set. A Match.
5167 break;
5168 }
5169 } else {
5170 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5171 if (s->contains(c)) {
5172 // The character is in the set. A Match.
5173 break;
5174 }
5175 }
5176
5177 // the character wasn't in the set.
5178 #ifdef REGEX_SMART_BACKTRACKING
5179 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5180 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5181 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5182 // Try to find it, backwards
5183 int64_t reverseIndex = fp->fInputIdx;
5184 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5185 UBool success = FALSE;
5186 do {
5187 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5188 if (c < 256) {
5189 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5190 if (s8->contains(c)) {
5191 success = TRUE;
5192 break;
5193 }
5194 } else {
5195 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5196 if (s->contains(c)) {
5197 success = TRUE;
5198 break;
5199 }
5200 }
5201 } while (reverseIndex > backSearchIndex);
5202
5203 if (success) {
5204 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5205 fp->fInputIdx = reverseIndex;
5206 if (fp->fInputIdx > reverseIndex) {
5207 fp = StateSave(fp, fp->fPatIdx, status);
5208 }
5209 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5210 break;
5211 }
5212 }
5213 }
5214 #endif
5215 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5216 }
5217 break;
5218
5219
5220 case URX_DOTANY:
5221 {
5222 // . matches anything, but stops at end-of-line.
5223 if (fp->fInputIdx >= fActiveLimit) {
5224 // At end of input. Match failed. Backtrack out.
5225 fHitEnd = TRUE;
5226 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5227 break;
5228 }
5229
5230 // There is input left. Advance over one char, unless we've hit end-of-line
5231 UChar32 c;
5232 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5233 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible
5234 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
5235 // End of line in normal mode. . does not match.
5236 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5237 break;
5238 }
5239 }
5240 break;
5241
5242
5243 case URX_DOTANY_ALL:
5244 {
5245 // . in dot-matches-all (including new lines) mode
5246 if (fp->fInputIdx >= fActiveLimit) {
5247 // At end of input. Match failed. Backtrack out.
5248 fHitEnd = TRUE;
5249 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5250 break;
5251 }
5252
5253 // There is input left. Advance over one char, except if we are
5254 // at a cr/lf, advance over both of them.
5255 UChar32 c;
5256 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5257 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5258 // In the case of a CR/LF, we need to advance over both.
5259 if (inputBuf[fp->fInputIdx] == 0x0a) {
5260 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5261 }
5262 }
5263 }
5264 break;
5265
5266
5267 case URX_DOTANY_UNIX:
5268 {
5269 // '.' operator, matches all, but stops at end-of-line.
5270 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5271 if (fp->fInputIdx >= fActiveLimit) {
5272 // At end of input. Match failed. Backtrack out.
5273 fHitEnd = TRUE;
5274 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5275 break;
5276 }
5277
5278 // There is input left. Advance over one char, unless we've hit end-of-line
5279 UChar32 c;
5280 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5281 if (c == 0x0a) {
5282 // End of line in normal mode. '.' does not match the \n
5283 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5284 }
5285 }
5286 break;
5287
5288
5289 case URX_JMP:
5290 fp->fPatIdx = opValue;
5291 break;
5292
5293 case URX_FAIL:
5294 isMatch = FALSE;
5295 goto breakFromLoop;
5296
5297 case URX_JMP_SAV:
5298 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5299 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5300 fp->fPatIdx = opValue; // Then JMP.
5301 break;
5302
5303 case URX_JMP_SAV_X:
5304 // This opcode is used with (x)+, when x can match a zero length string.
5305 // Same as JMP_SAV, except conditional on the match having made forward progress.
5306 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5307 // data address of the input position at the start of the loop.
5308 {
5309 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5310 int32_t stoOp = (int32_t)pat[opValue-1];
5311 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5312 int32_t frameLoc = URX_VAL(stoOp);
5313 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5314 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5315 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5316 if (prevInputIdx < fp->fInputIdx) {
5317 // The match did make progress. Repeat the loop.
5318 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5319 fp->fPatIdx = opValue;
5320 fp->fExtra[frameLoc] = fp->fInputIdx;
5321 }
5322 // If the input position did not advance, we do nothing here,
5323 // execution will fall out of the loop.
5324 }
5325 break;
5326
5327 case URX_CTR_INIT:
5328 {
5329 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5330 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5331
5332 // Pick up the three extra operands that CTR_INIT has, and
5333 // skip the pattern location counter past
5334 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5335 fp->fPatIdx += 3;
5336 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5337 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5338 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5339 U_ASSERT(minCount>=0);
5340 U_ASSERT(maxCount>=minCount || maxCount==-1);
5341 U_ASSERT(loopLoc>fp->fPatIdx);
5342
5343 if (minCount == 0) {
5344 fp = StateSave(fp, loopLoc+1, status);
5345 }
5346 if (maxCount == 0) {
5347 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5348 }
5349 }
5350 break;
5351
5352 case URX_CTR_LOOP:
5353 {
5354 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5355 int32_t initOp = (int32_t)pat[opValue];
5356 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5357 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5358 int32_t minCount = (int32_t)pat[opValue+2];
5359 int32_t maxCount = (int32_t)pat[opValue+3];
5360 // Increment the counter. Note: we DIDN'T worry about counter
5361 // overflow, since the data comes from UnicodeStrings, which
5362 // stores its length in an int32_t. Do we have to think about
5363 // this now that we're using UText? Probably not, since the length
5364 // in UChar32s is still an int32_t.
5365 (*pCounter)++;
5366 U_ASSERT(*pCounter > 0);
5367 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5368 U_ASSERT(*pCounter == maxCount || maxCount == -1);
5369 break;
5370 }
5371 if (*pCounter >= minCount) {
5372 fp = StateSave(fp, fp->fPatIdx, status);
5373 }
5374 fp->fPatIdx = opValue + 4; // Loop back.
5375 }
5376 break;
5377
5378 case URX_CTR_INIT_NG:
5379 {
5380 // Initialize a non-greedy loop
5381 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5382 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5383
5384 // Pick up the three extra operands that CTR_INIT has, and
5385 // skip the pattern location counter past
5386 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5387 fp->fPatIdx += 3;
5388 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5389 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5390 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5391 U_ASSERT(minCount>=0);
5392 U_ASSERT(maxCount>=minCount || maxCount==-1);
5393 U_ASSERT(loopLoc>fp->fPatIdx);
5394
5395 if (minCount == 0) {
5396 if (maxCount != 0) {
5397 fp = StateSave(fp, fp->fPatIdx, status);
5398 }
5399 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5400 }
5401 }
5402 break;
5403
5404 case URX_CTR_LOOP_NG:
5405 {
5406 // Non-greedy {min, max} loops
5407 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5408 int32_t initOp = (int32_t)pat[opValue];
5409 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5410 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5411 int32_t minCount = (int32_t)pat[opValue+2];
5412 int32_t maxCount = (int32_t)pat[opValue+3];
5413 // Increment the counter. Note: we DIDN'T worry about counter
5414 // overflow, since the data comes from UnicodeStrings, which
5415 // stores its length in an int32_t. Do we have to think about
5416 // this now that we're using UText? Probably not, since the length
5417 // in UChar32s is still an int32_t.
5418 (*pCounter)++;
5419 U_ASSERT(*pCounter > 0);
5420
5421 if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5422 // The loop has matched the maximum permitted number of times.
5423 // Break out of here with no action. Matching will
5424 // continue with the following pattern.
5425 U_ASSERT(*pCounter == maxCount || maxCount == -1);
5426 break;
5427 }
5428
5429 if (*pCounter < minCount) {
5430 // We haven't met the minimum number of matches yet.
5431 // Loop back for another one.
5432 fp->fPatIdx = opValue + 4; // Loop back.
5433 } else {
5434 // We do have the minimum number of matches.
5435 // Fall into the following pattern, but first do
5436 // a state save to the top of the loop, so that a failure
5437 // in the following pattern will try another iteration of the loop.
5438 fp = StateSave(fp, opValue + 4, status);
5439 }
5440 }
5441 break;
5442
5443 case URX_STO_SP:
5444 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5445 fData[opValue] = fStack->size();
5446 break;
5447
5448 case URX_LD_SP:
5449 {
5450 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5451 int32_t newStackSize = (int32_t)fData[opValue];
5452 U_ASSERT(newStackSize <= fStack->size());
5453 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5454 if (newFP == (int64_t *)fp) {
5455 break;
5456 }
5457 int32_t i;
5458 for (i=0; i<fFrameSize; i++) {
5459 newFP[i] = ((int64_t *)fp)[i];
5460 }
5461 fp = (REStackFrame *)newFP;
5462 fStack->setSize(newStackSize);
5463 }
5464 break;
5465
5466 case URX_BACKREF:
5467 case URX_BACKREF_I:
5468 {
5469 U_ASSERT(opValue < fFrameSize);
5470 int64_t groupStartIdx = fp->fExtra[opValue];
5471 int64_t groupEndIdx = fp->fExtra[opValue+1];
5472 U_ASSERT(groupStartIdx <= groupEndIdx);
5473 int64_t len = groupEndIdx-groupStartIdx;
5474 if (groupStartIdx < 0) {
5475 // This capture group has not participated in the match thus far,
5476 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5477 }
5478
5479 if (len == 0) {
5480 // The capture group match was of an empty string.
5481 // Verified by testing: Perl matches succeed in this case, so
5482 // we do too.
5483 break;
5484 }
5485
5486 UBool haveMatch = FALSE;
5487 if (fp->fInputIdx + len <= fActiveLimit) {
5488 if (opType == URX_BACKREF) {
5489 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
5490 haveMatch = TRUE;
5491 }
5492 } else {
5493 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
5494 (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
5495 haveMatch = TRUE;
5496 }
5497 }
5498 } else {
5499 // TODO: probably need to do a partial string comparison, and only
5500 // set HitEnd if the available input matched. Ticket #6074
5501 fHitEnd = TRUE;
5502 }
5503 if (haveMatch) {
5504 fp->fInputIdx += len; // Match. Advance current input position.
5505 } else {
5506 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5507 }
5508 }
5509 break;
5510
5511 case URX_STO_INP_LOC:
5512 {
5513 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5514 fp->fExtra[opValue] = fp->fInputIdx;
5515 }
5516 break;
5517
5518 case URX_JMPX:
5519 {
5520 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5521 fp->fPatIdx += 1;
5522 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5523 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5524 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5525 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5526 if (savedInputIdx < fp->fInputIdx) {
5527 fp->fPatIdx = opValue; // JMP
5528 } else {
5529 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5530 }
5531 }
5532 break;
5533
5534 case URX_LA_START:
5535 {
5536 // Entering a lookahead block.
5537 // Save Stack Ptr, Input Pos.
5538 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5539 fData[opValue] = fStack->size();
5540 fData[opValue+1] = fp->fInputIdx;
5541 fActiveStart = fLookStart; // Set the match region change for
5542 fActiveLimit = fLookLimit; // transparent bounds.
5543 }
5544 break;
5545
5546 case URX_LA_END:
5547 {
5548 // Leaving a look-ahead block.
5549 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5550 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5551 int32_t stackSize = fStack->size();
5552 int32_t newStackSize = (int32_t)fData[opValue];
5553 U_ASSERT(stackSize >= newStackSize);
5554 if (stackSize > newStackSize) {
5555 // Copy the current top frame back to the new (cut back) top frame.
5556 // This makes the capture groups from within the look-ahead
5557 // expression available.
5558 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5559 int32_t i;
5560 for (i=0; i<fFrameSize; i++) {
5561 newFP[i] = ((int64_t *)fp)[i];
5562 }
5563 fp = (REStackFrame *)newFP;
5564 fStack->setSize(newStackSize);
5565 }
5566 fp->fInputIdx = fData[opValue+1];
5567
5568 // Restore the active region bounds in the input string; they may have
5569 // been changed because of transparent bounds on a Region.
5570 fActiveStart = fRegionStart;
5571 fActiveLimit = fRegionLimit;
5572 }
5573 break;
5574
5575 case URX_ONECHAR_I:
5576 if (fp->fInputIdx < fActiveLimit) {
5577 UChar32 c;
5578 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5579 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5580 break;
5581 }
5582 } else {
5583 fHitEnd = TRUE;
5584 }
5585
5586 #ifdef REGEX_SMART_BACKTRACKING
5587 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5588 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5589 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5590 UBool success = FALSE;
5591 int64_t reverseIndex = fp->fInputIdx;
5592 UChar32 c;
5593 while (reverseIndex > backSearchIndex) {
5594 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5595 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5596 success = TRUE;
5597 break;
5598 } else if (c == U_SENTINEL) {
5599 break;
5600 }
5601 }
5602 if (success) {
5603 fHitEnd = FALSE;
5604 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5605 fp->fInputIdx = reverseIndex;
5606 if (fp->fInputIdx > backSearchIndex) {
5607 fp = StateSave(fp, fp->fPatIdx, status);
5608 }
5609 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5610 break;
5611 }
5612 }
5613 }
5614 #endif
5615
5616 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5617 break;
5618
5619 case URX_STRING_I:
5620 {
5621 // Test input against a literal string.
5622 // Strings require two slots in the compiled pattern, one for the
5623 // offset to the string text, and one for the length.
5624 const UCaseProps *csp = ucase_getSingleton();
5625 {
5626 int32_t stringStartIdx, stringLen;
5627 stringStartIdx = opValue;
5628
5629 op = (int32_t)pat[fp->fPatIdx];
5630 fp->fPatIdx++;
5631 opType = URX_TYPE(op);
5632 opValue = URX_VAL(op);
5633 U_ASSERT(opType == URX_STRING_LEN);
5634 stringLen = opValue;
5635
5636 const UChar *patternChars = litText+stringStartIdx;
5637 const UChar *patternEnd = patternChars+stringLen;
5638
5639 const UChar *foldChars = NULL;
5640 int32_t foldOffset, foldLength;
5641 UChar32 c;
5642 UBool c_is_valid = FALSE;
5643
5644 #ifdef REGEX_SMART_BACKTRACKING
5645 int32_t originalInputIdx = fp->fInputIdx;
5646 #endif
5647 UBool success = TRUE;
5648
5649 foldOffset = foldLength = 0;
5650
5651 while (patternChars < patternEnd && success) {
5652 if (fp->fInputIdx < fActiveLimit) { // don't read past end of string
5653 if(foldOffset < foldLength) {
5654 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5655 c_is_valid = TRUE;
5656 } else {
5657 // test pre-condition of U16_NEXT: i < length
5658 U_ASSERT(fp->fInputIdx < fActiveLimit);
5659 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5660 c_is_valid = TRUE;
5661 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5662 if(foldLength >= 0) {
5663 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
5664 foldOffset = 0;
5665 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5666 } else {
5667 c = foldLength;
5668 foldLength = foldOffset; // to avoid reading chars from the folding buffer
5669 }
5670 }
5671 }
5672 } else {
5673 c_is_valid = FALSE;
5674 }
5675
5676 if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
5677 if (U_IS_BMP(c)) {
5678 success = (*patternChars == c);
5679 patternChars += 1;
5680 } else if (patternChars+1 < patternEnd) {
5681 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
5682 patternChars += 2;
5683 }
5684 } else {
5685 success = FALSE;
5686 fHitEnd = TRUE; // TODO: See ticket 6074
5687 }
5688 }
5689
5690 if (!success) {
5691 #ifdef REGEX_SMART_BACKTRACKING
5692 if (fp->fInputIdx > backSearchIndex && fStack->size()) {
5693 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5694 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5695 // Reset to last start point
5696 int64_t reverseIndex = originalInputIdx;
5697 patternChars = litText+stringStartIdx;
5698
5699 // Search backwards for a possible start
5700 do {
5701 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5702 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5703 if(foldLength >= 0) {
5704 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings
5705 foldOffset = 0;
5706 U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5707 } else {
5708 c = foldLength;
5709 foldLength = foldOffset; // to avoid reading chars from the folding buffer
5710 }
5711 }
5712
5713 if ((U_IS_BMP(c) && *patternChars == c) ||
5714 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
5715 success = TRUE;
5716 break;
5717 }
5718 } while (reverseIndex > backSearchIndex);
5719
5720 // And try again
5721 if (success) {
5722 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5723 fp->fInputIdx = reverseIndex;
5724 if (fp->fInputIdx > backSearchIndex) {
5725 fp = StateSave(fp, fp->fPatIdx, status);
5726 }
5727 fp->fPatIdx++; // Skip the LOOP_C, we just did that
5728 break;
5729 }
5730 }
5731 }
5732 #endif
5733 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5734 }
5735 }
5736 }
5737 break;
5738
5739 case URX_LB_START:
5740 {
5741 // Entering a look-behind block.
5742 // Save Stack Ptr, Input Pos.
5743 // TODO: implement transparent bounds. Ticket #6067
5744 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5745 fData[opValue] = fStack->size();
5746 fData[opValue+1] = fp->fInputIdx;
5747 // Init the variable containing the start index for attempted matches.
5748 fData[opValue+2] = -1;
5749 // Save input string length, then reset to pin any matches to end at
5750 // the current position.
5751 fData[opValue+3] = fActiveLimit;
5752 fActiveLimit = fp->fInputIdx;
5753 }
5754 break;
5755
5756
5757 case URX_LB_CONT:
5758 {
5759 // Positive Look-Behind, at top of loop checking for matches of LB expression
5760 // at all possible input starting positions.
5761
5762 // Fetch the min and max possible match lengths. They are the operands
5763 // of this op in the pattern.
5764 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5765 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5766 U_ASSERT(minML <= maxML);
5767 U_ASSERT(minML >= 0);
5768
5769 // Fetch (from data) the last input index where a match was attempted.
5770 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5771 int64_t *lbStartIdx = &fData[opValue+2];
5772 if (*lbStartIdx < 0) {
5773 // First time through loop.
5774 *lbStartIdx = fp->fInputIdx - minML;
5775 } else {
5776 // 2nd through nth time through the loop.
5777 // Back up start position for match by one.
5778 if (*lbStartIdx == 0) {
5779 (*lbStartIdx)--;
5780 } else {
5781 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5782 }
5783 }
5784
5785 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5786 // We have tried all potential match starting points without
5787 // getting a match. Backtrack out, and out of the
5788 // Look Behind altogether.
5789 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5790 int64_t restoreInputLen = fData[opValue+3];
5791 U_ASSERT(restoreInputLen >= fActiveLimit);
5792 U_ASSERT(restoreInputLen <= fInputLength);
5793 fActiveLimit = restoreInputLen;
5794 break;
5795 }
5796
5797 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5798 // (successful match will fall off the end of the loop.)
5799 fp = StateSave(fp, fp->fPatIdx-3, status);
5800 fp->fInputIdx = *lbStartIdx;
5801 }
5802 break;
5803
5804 case URX_LB_END:
5805 // End of a look-behind block, after a successful match.
5806 {
5807 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5808 if (fp->fInputIdx != fActiveLimit) {
5809 // The look-behind expression matched, but the match did not
5810 // extend all the way to the point that we are looking behind from.
5811 // FAIL out of here, which will take us back to the LB_CONT, which
5812 // will retry the match starting at another position or fail
5813 // the look-behind altogether, whichever is appropriate.
5814 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5815 break;
5816 }
5817
5818 // Look-behind match is good. Restore the orignal input string length,
5819 // which had been truncated to pin the end of the lookbehind match to the
5820 // position being looked-behind.
5821 int64_t originalInputLen = fData[opValue+3];
5822 U_ASSERT(originalInputLen >= fActiveLimit);
5823 U_ASSERT(originalInputLen <= fInputLength);
5824 fActiveLimit = originalInputLen;
5825 }
5826 break;
5827
5828
5829 case URX_LBN_CONT:
5830 {
5831 // Negative Look-Behind, at top of loop checking for matches of LB expression
5832 // at all possible input starting positions.
5833
5834 // Fetch the extra parameters of this op.
5835 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5836 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5837 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5838 continueLoc = URX_VAL(continueLoc);
5839 U_ASSERT(minML <= maxML);
5840 U_ASSERT(minML >= 0);
5841 U_ASSERT(continueLoc > fp->fPatIdx);
5842
5843 // Fetch (from data) the last input index where a match was attempted.
5844 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5845 int64_t *lbStartIdx = &fData[opValue+2];
5846 if (*lbStartIdx < 0) {
5847 // First time through loop.
5848 *lbStartIdx = fp->fInputIdx - minML;
5849 } else {
5850 // 2nd through nth time through the loop.
5851 // Back up start position for match by one.
5852 if (*lbStartIdx == 0) {
5853 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
5854 } else {
5855 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5856 }
5857 }
5858
5859 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5860 // We have tried all potential match starting points without
5861 // getting a match, which means that the negative lookbehind as
5862 // a whole has succeeded. Jump forward to the continue location
5863 int64_t restoreInputLen = fData[opValue+3];
5864 U_ASSERT(restoreInputLen >= fActiveLimit);
5865 U_ASSERT(restoreInputLen <= fInputLength);
5866 fActiveLimit = restoreInputLen;
5867 fp->fPatIdx = continueLoc;
5868 break;
5869 }
5870
5871 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5872 // (successful match will cause a FAIL out of the loop altogether.)
5873 fp = StateSave(fp, fp->fPatIdx-4, status);
5874 fp->fInputIdx = *lbStartIdx;
5875 }
5876 break;
5877
5878 case URX_LBN_END:
5879 // End of a negative look-behind block, after a successful match.
5880 {
5881 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5882 if (fp->fInputIdx != fActiveLimit) {
5883 // The look-behind expression matched, but the match did not
5884 // extend all the way to the point that we are looking behind from.
5885 // FAIL out of here, which will take us back to the LB_CONT, which
5886 // will retry the match starting at another position or succeed
5887 // the look-behind altogether, whichever is appropriate.
5888 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5889 break;
5890 }
5891
5892 // Look-behind expression matched, which means look-behind test as
5893 // a whole Fails
5894
5895 // Restore the orignal input string length, which had been truncated
5896 // inorder to pin the end of the lookbehind match
5897 // to the position being looked-behind.
5898 int64_t originalInputLen = fData[opValue+3];
5899 U_ASSERT(originalInputLen >= fActiveLimit);
5900 U_ASSERT(originalInputLen <= fInputLength);
5901 fActiveLimit = originalInputLen;
5902
5903 // Restore original stack position, discarding any state saved
5904 // by the successful pattern match.
5905 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5906 int32_t newStackSize = (int32_t)fData[opValue];
5907 U_ASSERT(fStack->size() > newStackSize);
5908 fStack->setSize(newStackSize);
5909
5910 // FAIL, which will take control back to someplace
5911 // prior to entering the look-behind test.
5912 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5913 }
5914 break;
5915
5916
5917 case URX_LOOP_SR_I:
5918 // Loop Initialization for the optimized implementation of
5919 // [some character set]*
5920 // This op scans through all matching input.
5921 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5922 {
5923 U_ASSERT(opValue > 0 && opValue < sets->size());
5924 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5925 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5926
5927 // Loop through input, until either the input is exhausted or
5928 // we reach a character that is not a member of the set.
5929 int32_t ix = (int32_t)fp->fInputIdx;
5930 for (;;) {
5931 if (ix >= fActiveLimit) {
5932 fHitEnd = TRUE;
5933 break;
5934 }
5935 UChar32 c;
5936 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5937 if (c<256) {
5938 if (s8->contains(c) == FALSE) {
5939 U16_BACK_1(inputBuf, 0, ix);
5940 break;
5941 }
5942 } else {
5943 if (s->contains(c) == FALSE) {
5944 U16_BACK_1(inputBuf, 0, ix);
5945 break;
5946 }
5947 }
5948 }
5949
5950 // If there were no matching characters, skip over the loop altogether.
5951 // The loop doesn't run at all, a * op always succeeds.
5952 if (ix == fp->fInputIdx) {
5953 fp->fPatIdx++; // skip the URX_LOOP_C op.
5954 break;
5955 }
5956
5957 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5958 // must follow. It's operand is the stack location
5959 // that holds the starting input index for the match of this [set]*
5960 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5961 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5962 int32_t stackLoc = URX_VAL(loopcOp);
5963 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5964 fp->fExtra[stackLoc] = fp->fInputIdx;
5965 #ifdef REGEX_SMART_BACKTRACKING
5966 backSearchIndex = fp->fInputIdx;
5967 #endif
5968 fp->fInputIdx = ix;
5969
5970 // Save State to the URX_LOOP_C op that follows this one,
5971 // so that match failures in the following code will return to there.
5972 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5973 fp = StateSave(fp, fp->fPatIdx, status);
5974 fp->fPatIdx++;
5975 }
5976 break;
5977
5978
5979 case URX_LOOP_DOT_I:
5980 // Loop Initialization for the optimized implementation of .*
5981 // This op scans through all remaining input.
5982 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5983 {
5984 // Loop through input until the input is exhausted (we reach an end-of-line)
5985 // In DOTALL mode, we can just go straight to the end of the input.
5986 int32_t ix;
5987 if ((opValue & 1) == 1) {
5988 // Dot-matches-All mode. Jump straight to the end of the string.
5989 ix = (int32_t)fActiveLimit;
5990 fHitEnd = TRUE;
5991 } else {
5992 // NOT DOT ALL mode. Line endings do not match '.'
5993 // Scan forward until a line ending or end of input.
5994 ix = (int32_t)fp->fInputIdx;
5995 for (;;) {
5996 if (ix >= fActiveLimit) {
5997 fHitEnd = TRUE;
5998 break;
5999 }
6000 UChar32 c;
6001 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
6002 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
6003 if ((c == 0x0a) || // 0x0a is newline in both modes.
6004 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
6005 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
6006 // char is a line ending. Put the input pos back to the
6007 // line ending char, and exit the scanning loop.
6008 U16_BACK_1(inputBuf, 0, ix);
6009 break;
6010 }
6011 }
6012 }
6013 }
6014
6015 // If there were no matching characters, skip over the loop altogether.
6016 // The loop doesn't run at all, a * op always succeeds.
6017 if (ix == fp->fInputIdx) {
6018 fp->fPatIdx++; // skip the URX_LOOP_C op.
6019 break;
6020 }
6021
6022 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
6023 // must follow. It's operand is the stack location
6024 // that holds the starting input index for the match of this .*
6025 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
6026 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
6027 int32_t stackLoc = URX_VAL(loopcOp);
6028 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
6029 fp->fExtra[stackLoc] = fp->fInputIdx;
6030 #ifdef REGEX_SMART_BACKTRACKING
6031 backSearchIndex = fp->fInputIdx;
6032 #endif
6033 fp->fInputIdx = ix;
6034
6035 // Save State to the URX_LOOP_C op that follows this one,
6036 // so that match failures in the following code will return to there.
6037 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
6038 fp = StateSave(fp, fp->fPatIdx, status);
6039 fp->fPatIdx++;
6040 }
6041 break;
6042
6043
6044 case URX_LOOP_C:
6045 {
6046 U_ASSERT(opValue>=0 && opValue<fFrameSize);
6047 backSearchIndex = (int32_t)fp->fExtra[opValue];
6048 U_ASSERT(backSearchIndex <= fp->fInputIdx);
6049 if (backSearchIndex == fp->fInputIdx) {
6050 // We've backed up the input idx to the point that the loop started.
6051 // The loop is done. Leave here without saving state.
6052 // Subsequent failures won't come back here.
6053 break;
6054 }
6055 // Set up for the next iteration of the loop, with input index
6056 // backed up by one from the last time through,
6057 // and a state save to this instruction in case the following code fails again.
6058 // (We're going backwards because this loop emulates stack unwinding, not
6059 // the initial scan forward.)
6060 U_ASSERT(fp->fInputIdx > 0);
6061 UChar32 prevC;
6062 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
6063
6064 if (prevC == 0x0a &&
6065 fp->fInputIdx > backSearchIndex &&
6066 inputBuf[fp->fInputIdx-1] == 0x0d) {
6067 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
6068 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
6069 // .*, stepping back over CRLF pair.
6070 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
6071 }
6072 }
6073
6074
6075 fp = StateSave(fp, fp->fPatIdx-1, status);
6076 }
6077 break;
6078
6079
6080
6081 default:
6082 // Trouble. The compiled pattern contains an entry with an
6083 // unrecognized type tag.
6084 U_ASSERT(FALSE);
6085 }
6086
6087 if (U_FAILURE(status)) {
6088 isMatch = FALSE;
6089 break;
6090 }
6091 }
6092
6093 breakFromLoop:
6094 fMatch = isMatch;
6095 if (isMatch) {
6096 fLastMatchEnd = fMatchEnd;
6097 fMatchStart = startIdx;
6098 fMatchEnd = fp->fInputIdx;
6099 if (fTraceDebug) {
6100 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd));
6101 }
6102 }
6103 else
6104 {
6105 if (fTraceDebug) {
6106 REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6107 }
6108 }
6109
6110 fFrame = fp; // The active stack frame when the engine stopped.
6111 // Contains the capture group results that we need to
6112 // access later.
6113
6114 return;
6115 }
6116
6117
6118 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
6119
6120 U_NAMESPACE_END
6121
6122 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
6123