1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 // file: rematch.cpp
11 //
12 // Contains the implementation of class RegexMatcher,
13 // which is one of the main API classes for the ICU regular expression package.
14 //
15
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36
37 // #include <malloc.h> // Needed for heapcheck testing
38
39
40 U_NAMESPACE_BEGIN
41
42 // Default limit for the size of the back track stack, to avoid system
43 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 // which use recursion rather than the heap, and take more storage per
46 // backtrack point.
47 //
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50 // Time limit counter constant.
51 // Time limits for expression evaluation are in terms of quanta of work by
52 // the engine, each of which is 10,000 state saves.
53 // This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)58 static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64
65 //-----------------------------------------------------------------------------
66 //
67 // Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)70 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
76 if (pat==NULL) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
80 fPattern = pat;
81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83
84
85
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)86 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
88 init(status);
89 if (U_FAILURE(status)) {
90 return;
91 }
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
94 fPattern = fPatternOwned;
95
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
100
101 fInputUniStrMaybeMutable = TRUE;
102 }
103
104
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
117 fPattern = fPatternOwned;
118 init2(input, status);
119 }
120
121
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)122 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
123 uint32_t flags, UErrorCode &status) {
124 init(status);
125 if (U_FAILURE(status)) {
126 return;
127 }
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)137 RegexMatcher::RegexMatcher(UText *regexp,
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
149 fPattern = fPatternOwned;
150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152
153
154
155
~RegexMatcher()156 RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
159 uprv_free(fData);
160 fData = NULL;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = NULL;
165 fPattern = NULL;
166 }
167
168 if (fInput) {
169 delete fInput;
170 }
171 if (fInputText) {
172 utext_close(fInputText);
173 }
174 if (fAltInputText) {
175 utext_close(fAltInputText);
176 }
177
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr;
180 #endif
181 }
182
183 //
184 // init() common initialization for use by all constructors.
185 // Initialize all fields, get the object into a consistent state.
186 // This must be done even when the initial status shows an error,
187 // so that the object is initialized sufficiently well for the destructor
188 // to run safely.
189 //
init(UErrorCode & status)190 void RegexMatcher::init(UErrorCode &status) {
191 fPattern = NULL;
192 fPatternOwned = NULL;
193 fFrameSize = 0;
194 fRegionStart = 0;
195 fRegionLimit = 0;
196 fAnchorStart = 0;
197 fAnchorLimit = 0;
198 fLookStart = 0;
199 fLookLimit = 0;
200 fActiveStart = 0;
201 fActiveLimit = 0;
202 fTransparentBounds = FALSE;
203 fAnchoringBounds = TRUE;
204 fMatch = FALSE;
205 fMatchStart = 0;
206 fMatchEnd = 0;
207 fLastMatchEnd = -1;
208 fAppendPosition = 0;
209 fHitEnd = FALSE;
210 fRequireEnd = FALSE;
211 fStack = NULL;
212 fFrame = NULL;
213 fTimeLimit = 0;
214 fTime = 0;
215 fTickCounter = 0;
216 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
217 fCallbackFn = NULL;
218 fCallbackContext = NULL;
219 fFindProgressCallbackFn = NULL;
220 fFindProgressCallbackContext = NULL;
221 fTraceDebug = FALSE;
222 fDeferredStatus = status;
223 fData = fSmallData;
224 fWordBreakItr = NULL;
225
226 fStack = NULL;
227 fInputText = NULL;
228 fAltInputText = NULL;
229 fInput = NULL;
230 fInputLength = 0;
231 fInputUniStrMaybeMutable = FALSE;
232 }
233
234 //
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
237 //
init2(UText * input,UErrorCode & status)238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
239 if (U_FAILURE(status)) {
240 fDeferredStatus = status;
241 return;
242 }
243
244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246 if (fData == NULL) {
247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248 return;
249 }
250 }
251
252 fStack = new UVector64(status);
253 if (fStack == NULL) {
254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255 return;
256 }
257
258 reset(input);
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260 if (U_FAILURE(status)) {
261 fDeferredStatus = status;
262 return;
263 }
264 }
265
266
267 static const UChar BACKSLASH = 0x5c;
268 static const UChar DOLLARSIGN = 0x24;
269 static const UChar LEFTBRACKET = 0x7b;
270 static const UChar RIGHTBRACKET = 0x7d;
271
272 //--------------------------------------------------------------------------------
273 //
274 // appendReplacement
275 //
276 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278 const UnicodeString &replacement,
279 UErrorCode &status) {
280 UText replacementText = UTEXT_INITIALIZER;
281
282 utext_openConstUnicodeString(&replacementText, &replacement, &status);
283 if (U_SUCCESS(status)) {
284 UText resultText = UTEXT_INITIALIZER;
285 utext_openUnicodeString(&resultText, &dest, &status);
286
287 if (U_SUCCESS(status)) {
288 appendReplacement(&resultText, &replacementText, status);
289 utext_close(&resultText);
290 }
291 utext_close(&replacementText);
292 }
293
294 return *this;
295 }
296
297 //
298 // appendReplacement, UText mode
299 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301 UText *replacement,
302 UErrorCode &status) {
303 if (U_FAILURE(status)) {
304 return *this;
305 }
306 if (U_FAILURE(fDeferredStatus)) {
307 status = fDeferredStatus;
308 return *this;
309 }
310 if (fMatch == FALSE) {
311 status = U_REGEX_INVALID_STATE;
312 return *this;
313 }
314
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen = utext_nativeLength(dest);
317 if (fMatchStart > fAppendPosition) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320 (int32_t)(fMatchStart-fAppendPosition), &status);
321 } else {
322 int32_t len16;
323 if (UTEXT_USES_U16(fInputText)) {
324 len16 = (int32_t)(fMatchStart-fAppendPosition);
325 } else {
326 UErrorCode lengthStatus = U_ZERO_ERROR;
327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328 }
329 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330 if (inputChars == NULL) {
331 status = U_MEMORY_ALLOCATION_ERROR;
332 return *this;
333 }
334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336 uprv_free(inputChars);
337 }
338 }
339 fAppendPosition = fMatchEnd;
340
341
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement, 0);
346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
347 if (c == BACKSLASH) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
352 // loop iteration.
353 c = UTEXT_CURRENT32(replacement);
354 if (c == U_SENTINEL) {
355 break;
356 }
357
358 if (c==0x55/*U*/ || c==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
360 int32_t offset = 0;
361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363 if (escapedChar != (UChar32)0xFFFFFFFF) {
364 if (U_IS_BMP(escapedChar)) {
365 UChar c16 = (UChar)escapedChar;
366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367 } else {
368 UChar surrogate[2];
369 surrogate[0] = U16_LEAD(escapedChar);
370 surrogate[1] = U16_TRAIL(escapedChar);
371 if (U_SUCCESS(status)) {
372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373 }
374 }
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
377 if (context.lastOffset == offset) {
378 (void)UTEXT_PREVIOUS32(replacement);
379 } else if (context.lastOffset != offset-1) {
380 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381 }
382 }
383 } else {
384 (void)UTEXT_NEXT32(replacement);
385 // Plain backslash escape. Just put out the escaped character.
386 if (U_IS_BMP(c)) {
387 UChar c16 = (UChar)c;
388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389 } else {
390 UChar surrogate[2];
391 surrogate[0] = U16_LEAD(c);
392 surrogate[1] = U16_TRAIL(c);
393 if (U_SUCCESS(status)) {
394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395 }
396 }
397 }
398 } else if (c != DOLLARSIGN) {
399 // Normal char, not a $. Copy it out without further checks.
400 if (U_IS_BMP(c)) {
401 UChar c16 = (UChar)c;
402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403 } else {
404 UChar surrogate[2];
405 surrogate[0] = U16_LEAD(c);
406 surrogate[1] = U16_TRAIL(c);
407 if (U_SUCCESS(status)) {
408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409 }
410 }
411 } else {
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
415
416 int32_t groupNum = 0;
417 int32_t numDigits = 0;
418 UChar32 nextChar = utext_current32(replacement);
419 if (nextChar == LEFTBRACKET) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName;
422 utext_next32(replacement);
423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424 nextChar = utext_next32(replacement);
425 if (nextChar == U_SENTINEL) {
426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
430 groupName.append(nextChar);
431 } else if (nextChar == RIGHTBRACKET) {
432 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
433 if (groupNum == 0) {
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
436 } else {
437 // Character was something other than a name char or a closing '}'
438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439 }
440 }
441
442 } else if (u_isdigit(nextChar)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445 for (;;) {
446 nextChar = UTEXT_CURRENT32(replacement);
447 if (nextChar == U_SENTINEL) {
448 break;
449 }
450 if (u_isdigit(nextChar) == FALSE) {
451 break;
452 }
453 int32_t nextDigitVal = u_charDigitValue(nextChar);
454 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits == 0) {
457 status = U_INDEX_OUTOFBOUNDS_ERROR;
458 }
459 break;
460 }
461 (void)UTEXT_NEXT32(replacement);
462 groupNum=groupNum*10 + nextDigitVal;
463 ++numDigits;
464 }
465 } else {
466 // $ not followed by capture group name or number.
467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468 }
469
470 if (U_SUCCESS(status)) {
471 destLen += appendGroup(groupNum, dest, status);
472 }
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
475
476 return *this;
477 }
478
479
480
481 //--------------------------------------------------------------------------------
482 //
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
486 //
487 // Note: Match ranges do not affect appendTail or appendReplacement
488 //
489 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491 UErrorCode status = U_ZERO_ERROR;
492 UText resultText = UTEXT_INITIALIZER;
493 utext_openUnicodeString(&resultText, &dest, &status);
494
495 if (U_SUCCESS(status)) {
496 appendTail(&resultText, status);
497 utext_close(&resultText);
498 }
499
500 return dest;
501 }
502
503 //
504 // appendTail, UText mode
505 //
appendTail(UText * dest,UErrorCode & status)506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507 if (U_FAILURE(status)) {
508 return dest;
509 }
510 if (U_FAILURE(fDeferredStatus)) {
511 status = fDeferredStatus;
512 return dest;
513 }
514
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
528
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
539 }
540 return dest;
541 }
542
543
544
545 //--------------------------------------------------------------------------------
546 //
547 // end
548 //
549 //--------------------------------------------------------------------------------
end(UErrorCode & err) const550 int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552 }
553
end64(UErrorCode & err) const554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556 }
557
end64(int32_t group,UErrorCode & err) const558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
570 int64_t e = -1;
571 if (group == 0) {
572 e = fMatchEnd;
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
581
582 return e;
583 }
584
end(int32_t group,UErrorCode & err) const585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587 }
588
589 //--------------------------------------------------------------------------------
590 //
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
594 //
595 // Return: TRUE if the find operation is to be terminated.
596 // FALSE if the find operation is to continue running.
597 //
598 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601 status = U_REGEX_STOPPED_BY_CALLER;
602 return TRUE;
603 }
604 return FALSE;
605 }
606
607 //--------------------------------------------------------------------------------
608 //
609 // find()
610 //
611 //--------------------------------------------------------------------------------
find()612 UBool RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus)) {
614 return FALSE;
615 }
616 UErrorCode status = U_ZERO_ERROR;
617 UBool result = find(status);
618 return result;
619 }
620
621 //--------------------------------------------------------------------------------
622 //
623 // find()
624 //
625 //--------------------------------------------------------------------------------
find(UErrorCode & status)626 UBool RegexMatcher::find(UErrorCode &status) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
629 //
630 if (U_FAILURE(status)) {
631 return FALSE;
632 }
633 if (U_FAILURE(fDeferredStatus)) {
634 status = fDeferredStatus;
635 return FALSE;
636 }
637
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639 return findUsingChunk(status);
640 }
641
642 int64_t startPos = fMatchEnd;
643 if (startPos==0) {
644 startPos = fActiveStart;
645 }
646
647 if (fMatch) {
648 // Save the position of any previous successful match.
649 fLastMatchEnd = fMatchEnd;
650
651 if (fMatchStart == fMatchEnd) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos >= fActiveLimit) {
655 fMatch = FALSE;
656 fHitEnd = TRUE;
657 return FALSE;
658 }
659 UTEXT_SETNATIVEINDEX(fInputText, startPos);
660 (void)UTEXT_NEXT32(fInputText);
661 startPos = UTEXT_GETNATIVEINDEX(fInputText);
662 }
663 } else {
664 if (fLastMatchEnd >= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
668 fHitEnd = TRUE;
669 return FALSE;
670 }
671 }
672
673
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit;
679 if (UTEXT_USES_U16(fInputText)) {
680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681 if (startPos > testStartLimit) {
682 fMatch = FALSE;
683 fHitEnd = TRUE;
684 return FALSE;
685 }
686 } else {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690 }
691
692 UChar32 c;
693 U_ASSERT(startPos >= 0);
694
695 switch (fPattern->fStartType) {
696 case START_NO_INFO:
697 // No optimization was found.
698 // Try a match at each input position.
699 for (;;) {
700 MatchAt(startPos, FALSE, status);
701 if (U_FAILURE(status)) {
702 return FALSE;
703 }
704 if (fMatch) {
705 return TRUE;
706 }
707 if (startPos >= testStartLimit) {
708 fHitEnd = TRUE;
709 return FALSE;
710 }
711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
712 (void)UTEXT_NEXT32(fInputText);
713 startPos = UTEXT_GETNATIVEINDEX(fInputText);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos, status))
718 return FALSE;
719 }
720 U_ASSERT(FALSE);
721
722 case START_START:
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos > fActiveStart) {
726 fMatch = FALSE;
727 return FALSE;
728 }
729 MatchAt(startPos, FALSE, status);
730 if (U_FAILURE(status)) {
731 return FALSE;
732 }
733 return fMatch;
734
735
736 case START_SET:
737 {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern->fMinMatchLen > 0);
740 UTEXT_SETNATIVEINDEX(fInputText, startPos);
741 for (;;) {
742 int64_t pos = startPos;
743 c = UTEXT_NEXT32(fInputText);
744 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749 (c>=256 && fPattern->fInitialChars->contains(c)))) {
750 MatchAt(pos, FALSE, status);
751 if (U_FAILURE(status)) {
752 return FALSE;
753 }
754 if (fMatch) {
755 return TRUE;
756 }
757 UTEXT_SETNATIVEINDEX(fInputText, pos);
758 }
759 if (startPos > testStartLimit) {
760 fMatch = FALSE;
761 fHitEnd = TRUE;
762 return FALSE;
763 }
764 if (findProgressInterrupt(startPos, status))
765 return FALSE;
766 }
767 }
768 U_ASSERT(FALSE);
769
770 case START_STRING:
771 case START_CHAR:
772 {
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern->fMinMatchLen > 0);
775 UChar32 theChar = fPattern->fInitialChar;
776 UTEXT_SETNATIVEINDEX(fInputText, startPos);
777 for (;;) {
778 int64_t pos = startPos;
779 c = UTEXT_NEXT32(fInputText);
780 startPos = UTEXT_GETNATIVEINDEX(fInputText);
781 if (c == theChar) {
782 MatchAt(pos, FALSE, status);
783 if (U_FAILURE(status)) {
784 return FALSE;
785 }
786 if (fMatch) {
787 return TRUE;
788 }
789 UTEXT_SETNATIVEINDEX(fInputText, startPos);
790 }
791 if (startPos > testStartLimit) {
792 fMatch = FALSE;
793 fHitEnd = TRUE;
794 return FALSE;
795 }
796 if (findProgressInterrupt(startPos, status))
797 return FALSE;
798 }
799 }
800 U_ASSERT(FALSE);
801
802 case START_LINE:
803 {
804 UChar32 ch;
805 if (startPos == fAnchorStart) {
806 MatchAt(startPos, FALSE, status);
807 if (U_FAILURE(status)) {
808 return FALSE;
809 }
810 if (fMatch) {
811 return TRUE;
812 }
813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814 ch = UTEXT_NEXT32(fInputText);
815 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816 } else {
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818 ch = UTEXT_PREVIOUS32(fInputText);
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820 }
821
822 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823 for (;;) {
824 if (ch == 0x0a) {
825 MatchAt(startPos, FALSE, status);
826 if (U_FAILURE(status)) {
827 return FALSE;
828 }
829 if (fMatch) {
830 return TRUE;
831 }
832 UTEXT_SETNATIVEINDEX(fInputText, startPos);
833 }
834 if (startPos >= testStartLimit) {
835 fMatch = FALSE;
836 fHitEnd = TRUE;
837 return FALSE;
838 }
839 ch = UTEXT_NEXT32(fInputText);
840 startPos = UTEXT_GETNATIVEINDEX(fInputText);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos, status))
845 return FALSE;
846 }
847 } else {
848 for (;;) {
849 if (isLineTerminator(ch)) {
850 if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText);
852 startPos = UTEXT_GETNATIVEINDEX(fInputText);
853 }
854 MatchAt(startPos, FALSE, status);
855 if (U_FAILURE(status)) {
856 return FALSE;
857 }
858 if (fMatch) {
859 return TRUE;
860 }
861 UTEXT_SETNATIVEINDEX(fInputText, startPos);
862 }
863 if (startPos >= testStartLimit) {
864 fMatch = FALSE;
865 fHitEnd = TRUE;
866 return FALSE;
867 }
868 ch = UTEXT_NEXT32(fInputText);
869 startPos = UTEXT_GETNATIVEINDEX(fInputText);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos, status))
874 return FALSE;
875 }
876 }
877 }
878
879 default:
880 U_ASSERT(FALSE);
881 }
882
883 U_ASSERT(FALSE);
884 return FALSE;
885 }
886
887
888
find(int64_t start,UErrorCode & status)889 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
890 if (U_FAILURE(status)) {
891 return FALSE;
892 }
893 if (U_FAILURE(fDeferredStatus)) {
894 status = fDeferredStatus;
895 return FALSE;
896 }
897 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
898 // This will reset the region to be the full input length.
899 if (start < 0) {
900 status = U_INDEX_OUTOFBOUNDS_ERROR;
901 return FALSE;
902 }
903
904 int64_t nativeStart = start;
905 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
906 status = U_INDEX_OUTOFBOUNDS_ERROR;
907 return FALSE;
908 }
909 fMatchEnd = nativeStart;
910 return find(status);
911 }
912
913
914 //--------------------------------------------------------------------------------
915 //
916 // findUsingChunk() -- like find(), but with the advance knowledge that the
917 // entire string is available in the UText's chunk buffer.
918 //
919 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)920 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
921 // Start at the position of the last match end. (Will be zero if the
922 // matcher has been reset.
923 //
924
925 int32_t startPos = (int32_t)fMatchEnd;
926 if (startPos==0) {
927 startPos = (int32_t)fActiveStart;
928 }
929
930 const UChar *inputBuf = fInputText->chunkContents;
931
932 if (fMatch) {
933 // Save the position of any previous successful match.
934 fLastMatchEnd = fMatchEnd;
935
936 if (fMatchStart == fMatchEnd) {
937 // Previous match had zero length. Move start position up one position
938 // to avoid sending find() into a loop on zero-length matches.
939 if (startPos >= fActiveLimit) {
940 fMatch = FALSE;
941 fHitEnd = TRUE;
942 return FALSE;
943 }
944 U16_FWD_1(inputBuf, startPos, fInputLength);
945 }
946 } else {
947 if (fLastMatchEnd >= 0) {
948 // A previous find() failed to match. Don't try again.
949 // (without this test, a pattern with a zero-length match
950 // could match again at the end of an input string.)
951 fHitEnd = TRUE;
952 return FALSE;
953 }
954 }
955
956
957 // Compute the position in the input string beyond which a match can not begin, because
958 // the minimum length match would extend past the end of the input.
959 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
960 // Be aware of possible overflows if making changes here.
961 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
962 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
963 if (startPos > testLen) {
964 fMatch = FALSE;
965 fHitEnd = TRUE;
966 return FALSE;
967 }
968
969 UChar32 c;
970 U_ASSERT(startPos >= 0);
971
972 switch (fPattern->fStartType) {
973 case START_NO_INFO:
974 // No optimization was found.
975 // Try a match at each input position.
976 for (;;) {
977 MatchChunkAt(startPos, FALSE, status);
978 if (U_FAILURE(status)) {
979 return FALSE;
980 }
981 if (fMatch) {
982 return TRUE;
983 }
984 if (startPos >= testLen) {
985 fHitEnd = TRUE;
986 return FALSE;
987 }
988 U16_FWD_1(inputBuf, startPos, fActiveLimit);
989 // Note that it's perfectly OK for a pattern to have a zero-length
990 // match at the end of a string, so we must make sure that the loop
991 // runs with startPos == testLen the last time through.
992 if (findProgressInterrupt(startPos, status))
993 return FALSE;
994 }
995 U_ASSERT(FALSE);
996
997 case START_START:
998 // Matches are only possible at the start of the input string
999 // (pattern begins with ^ or \A)
1000 if (startPos > fActiveStart) {
1001 fMatch = FALSE;
1002 return FALSE;
1003 }
1004 MatchChunkAt(startPos, FALSE, status);
1005 if (U_FAILURE(status)) {
1006 return FALSE;
1007 }
1008 return fMatch;
1009
1010
1011 case START_SET:
1012 {
1013 // Match may start on any char from a pre-computed set.
1014 U_ASSERT(fPattern->fMinMatchLen > 0);
1015 for (;;) {
1016 int32_t pos = startPos;
1017 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1018 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1019 (c>=256 && fPattern->fInitialChars->contains(c))) {
1020 MatchChunkAt(pos, FALSE, status);
1021 if (U_FAILURE(status)) {
1022 return FALSE;
1023 }
1024 if (fMatch) {
1025 return TRUE;
1026 }
1027 }
1028 if (startPos > testLen) {
1029 fMatch = FALSE;
1030 fHitEnd = TRUE;
1031 return FALSE;
1032 }
1033 if (findProgressInterrupt(startPos, status))
1034 return FALSE;
1035 }
1036 }
1037 U_ASSERT(FALSE);
1038
1039 case START_STRING:
1040 case START_CHAR:
1041 {
1042 // Match starts on exactly one char.
1043 U_ASSERT(fPattern->fMinMatchLen > 0);
1044 UChar32 theChar = fPattern->fInitialChar;
1045 for (;;) {
1046 int32_t pos = startPos;
1047 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1048 if (c == theChar) {
1049 MatchChunkAt(pos, FALSE, status);
1050 if (U_FAILURE(status)) {
1051 return FALSE;
1052 }
1053 if (fMatch) {
1054 return TRUE;
1055 }
1056 }
1057 if (startPos > testLen) {
1058 fMatch = FALSE;
1059 fHitEnd = TRUE;
1060 return FALSE;
1061 }
1062 if (findProgressInterrupt(startPos, status))
1063 return FALSE;
1064 }
1065 }
1066 U_ASSERT(FALSE);
1067
1068 case START_LINE:
1069 {
1070 UChar32 ch;
1071 if (startPos == fAnchorStart) {
1072 MatchChunkAt(startPos, FALSE, status);
1073 if (U_FAILURE(status)) {
1074 return FALSE;
1075 }
1076 if (fMatch) {
1077 return TRUE;
1078 }
1079 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1080 }
1081
1082 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1083 for (;;) {
1084 ch = inputBuf[startPos-1];
1085 if (ch == 0x0a) {
1086 MatchChunkAt(startPos, FALSE, status);
1087 if (U_FAILURE(status)) {
1088 return FALSE;
1089 }
1090 if (fMatch) {
1091 return TRUE;
1092 }
1093 }
1094 if (startPos >= testLen) {
1095 fMatch = FALSE;
1096 fHitEnd = TRUE;
1097 return FALSE;
1098 }
1099 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1100 // Note that it's perfectly OK for a pattern to have a zero-length
1101 // match at the end of a string, so we must make sure that the loop
1102 // runs with startPos == testLen the last time through.
1103 if (findProgressInterrupt(startPos, status))
1104 return FALSE;
1105 }
1106 } else {
1107 for (;;) {
1108 ch = inputBuf[startPos-1];
1109 if (isLineTerminator(ch)) {
1110 if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1111 startPos++;
1112 }
1113 MatchChunkAt(startPos, FALSE, status);
1114 if (U_FAILURE(status)) {
1115 return FALSE;
1116 }
1117 if (fMatch) {
1118 return TRUE;
1119 }
1120 }
1121 if (startPos >= testLen) {
1122 fMatch = FALSE;
1123 fHitEnd = TRUE;
1124 return FALSE;
1125 }
1126 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1127 // Note that it's perfectly OK for a pattern to have a zero-length
1128 // match at the end of a string, so we must make sure that the loop
1129 // runs with startPos == testLen the last time through.
1130 if (findProgressInterrupt(startPos, status))
1131 return FALSE;
1132 }
1133 }
1134 }
1135
1136 default:
1137 U_ASSERT(FALSE);
1138 }
1139
1140 U_ASSERT(FALSE);
1141 return FALSE;
1142 }
1143
1144
1145
1146 //--------------------------------------------------------------------------------
1147 //
1148 // group()
1149 //
1150 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1151 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1152 return group(0, status);
1153 }
1154
1155 // Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1156 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1157 return group(0, dest, group_len, status);
1158 }
1159
1160 // Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1161 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1162 group_len = 0;
1163 if (U_FAILURE(status)) {
1164 return dest;
1165 }
1166 if (U_FAILURE(fDeferredStatus)) {
1167 status = fDeferredStatus;
1168 } else if (fMatch == FALSE) {
1169 status = U_REGEX_INVALID_STATE;
1170 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1171 status = U_INDEX_OUTOFBOUNDS_ERROR;
1172 }
1173
1174 if (U_FAILURE(status)) {
1175 return dest;
1176 }
1177
1178 int64_t s, e;
1179 if (groupNum == 0) {
1180 s = fMatchStart;
1181 e = fMatchEnd;
1182 } else {
1183 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1184 U_ASSERT(groupOffset < fPattern->fFrameSize);
1185 U_ASSERT(groupOffset >= 0);
1186 s = fFrame->fExtra[groupOffset];
1187 e = fFrame->fExtra[groupOffset+1];
1188 }
1189
1190 if (s < 0) {
1191 // A capture group wasn't part of the match
1192 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1193 }
1194 U_ASSERT(s <= e);
1195 group_len = e - s;
1196
1197 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1198 if (dest)
1199 UTEXT_SETNATIVEINDEX(dest, s);
1200 return dest;
1201 }
1202
group(int32_t groupNum,UErrorCode & status) const1203 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1204 UnicodeString result;
1205 int64_t groupStart = start64(groupNum, status);
1206 int64_t groupEnd = end64(groupNum, status);
1207 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1208 return result;
1209 }
1210
1211 // Get the group length using a utext_extract preflight.
1212 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1213 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1214 if (status != U_BUFFER_OVERFLOW_ERROR) {
1215 return result;
1216 }
1217
1218 status = U_ZERO_ERROR;
1219 UChar *buf = result.getBuffer(length);
1220 if (buf == NULL) {
1221 status = U_MEMORY_ALLOCATION_ERROR;
1222 } else {
1223 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1224 result.releaseBuffer(extractLength);
1225 U_ASSERT(length == extractLength);
1226 }
1227 return result;
1228 }
1229
1230
1231 //--------------------------------------------------------------------------------
1232 //
1233 // appendGroup() -- currently internal only, appends a group to a UText rather
1234 // than replacing its contents
1235 //
1236 //--------------------------------------------------------------------------------
1237
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1238 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1239 if (U_FAILURE(status)) {
1240 return 0;
1241 }
1242 if (U_FAILURE(fDeferredStatus)) {
1243 status = fDeferredStatus;
1244 return 0;
1245 }
1246 int64_t destLen = utext_nativeLength(dest);
1247
1248 if (fMatch == FALSE) {
1249 status = U_REGEX_INVALID_STATE;
1250 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1251 }
1252 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1253 status = U_INDEX_OUTOFBOUNDS_ERROR;
1254 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1255 }
1256
1257 int64_t s, e;
1258 if (groupNum == 0) {
1259 s = fMatchStart;
1260 e = fMatchEnd;
1261 } else {
1262 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1263 U_ASSERT(groupOffset < fPattern->fFrameSize);
1264 U_ASSERT(groupOffset >= 0);
1265 s = fFrame->fExtra[groupOffset];
1266 e = fFrame->fExtra[groupOffset+1];
1267 }
1268
1269 if (s < 0) {
1270 // A capture group wasn't part of the match
1271 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1272 }
1273 U_ASSERT(s <= e);
1274
1275 int64_t deltaLen;
1276 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1277 U_ASSERT(e <= fInputLength);
1278 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1279 } else {
1280 int32_t len16;
1281 if (UTEXT_USES_U16(fInputText)) {
1282 len16 = (int32_t)(e-s);
1283 } else {
1284 UErrorCode lengthStatus = U_ZERO_ERROR;
1285 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1286 }
1287 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1288 if (groupChars == NULL) {
1289 status = U_MEMORY_ALLOCATION_ERROR;
1290 return 0;
1291 }
1292 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1293
1294 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1295 uprv_free(groupChars);
1296 }
1297 return deltaLen;
1298 }
1299
1300
1301
1302 //--------------------------------------------------------------------------------
1303 //
1304 // groupCount()
1305 //
1306 //--------------------------------------------------------------------------------
groupCount() const1307 int32_t RegexMatcher::groupCount() const {
1308 return fPattern->fGroupMap->size();
1309 }
1310
1311 //--------------------------------------------------------------------------------
1312 //
1313 // hasAnchoringBounds()
1314 //
1315 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1316 UBool RegexMatcher::hasAnchoringBounds() const {
1317 return fAnchoringBounds;
1318 }
1319
1320
1321 //--------------------------------------------------------------------------------
1322 //
1323 // hasTransparentBounds()
1324 //
1325 //--------------------------------------------------------------------------------
hasTransparentBounds() const1326 UBool RegexMatcher::hasTransparentBounds() const {
1327 return fTransparentBounds;
1328 }
1329
1330
1331
1332 //--------------------------------------------------------------------------------
1333 //
1334 // hitEnd()
1335 //
1336 //--------------------------------------------------------------------------------
hitEnd() const1337 UBool RegexMatcher::hitEnd() const {
1338 return fHitEnd;
1339 }
1340
1341
1342 //--------------------------------------------------------------------------------
1343 //
1344 // input()
1345 //
1346 //--------------------------------------------------------------------------------
input() const1347 const UnicodeString &RegexMatcher::input() const {
1348 if (!fInput) {
1349 UErrorCode status = U_ZERO_ERROR;
1350 int32_t len16;
1351 if (UTEXT_USES_U16(fInputText)) {
1352 len16 = (int32_t)fInputLength;
1353 } else {
1354 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1355 status = U_ZERO_ERROR; // overflow, length status
1356 }
1357 UnicodeString *result = new UnicodeString(len16, 0, 0);
1358
1359 UChar *inputChars = result->getBuffer(len16);
1360 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1361 result->releaseBuffer(len16);
1362
1363 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1364 }
1365
1366 return *fInput;
1367 }
1368
1369 //--------------------------------------------------------------------------------
1370 //
1371 // inputText()
1372 //
1373 //--------------------------------------------------------------------------------
inputText() const1374 UText *RegexMatcher::inputText() const {
1375 return fInputText;
1376 }
1377
1378
1379 //--------------------------------------------------------------------------------
1380 //
1381 // getInput() -- like inputText(), but makes a clone or copies into another UText
1382 //
1383 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1384 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1385 if (U_FAILURE(status)) {
1386 return dest;
1387 }
1388 if (U_FAILURE(fDeferredStatus)) {
1389 status = fDeferredStatus;
1390 return dest;
1391 }
1392
1393 if (dest) {
1394 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1395 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1396 } else {
1397 int32_t input16Len;
1398 if (UTEXT_USES_U16(fInputText)) {
1399 input16Len = (int32_t)fInputLength;
1400 } else {
1401 UErrorCode lengthStatus = U_ZERO_ERROR;
1402 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1403 }
1404 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1405 if (inputChars == NULL) {
1406 return dest;
1407 }
1408
1409 status = U_ZERO_ERROR;
1410 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1411 status = U_ZERO_ERROR;
1412 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1413
1414 uprv_free(inputChars);
1415 }
1416 return dest;
1417 } else {
1418 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1419 }
1420 }
1421
1422
1423 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1424 static UBool compat_SyncMutableUTextContents(UText *ut) {
1425 UBool retVal = FALSE;
1426
1427 // In the following test, we're really only interested in whether the UText should switch
1428 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1429 // will still point to the correct data.
1430 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1431 UnicodeString *us=(UnicodeString *)ut->context;
1432
1433 // Update to the latest length.
1434 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1435 int32_t newLength = us->length();
1436
1437 // Update the chunk description.
1438 // The buffer may have switched between stack- and heap-based.
1439 ut->chunkContents = us->getBuffer();
1440 ut->chunkLength = newLength;
1441 ut->chunkNativeLimit = newLength;
1442 ut->nativeIndexingLimit = newLength;
1443 retVal = TRUE;
1444 }
1445
1446 return retVal;
1447 }
1448
1449 //--------------------------------------------------------------------------------
1450 //
1451 // lookingAt()
1452 //
1453 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1454 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1455 if (U_FAILURE(status)) {
1456 return FALSE;
1457 }
1458 if (U_FAILURE(fDeferredStatus)) {
1459 status = fDeferredStatus;
1460 return FALSE;
1461 }
1462
1463 if (fInputUniStrMaybeMutable) {
1464 if (compat_SyncMutableUTextContents(fInputText)) {
1465 fInputLength = utext_nativeLength(fInputText);
1466 reset();
1467 }
1468 }
1469 else {
1470 resetPreserveRegion();
1471 }
1472 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1473 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1474 } else {
1475 MatchAt(fActiveStart, FALSE, status);
1476 }
1477 return fMatch;
1478 }
1479
1480
lookingAt(int64_t start,UErrorCode & status)1481 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1482 if (U_FAILURE(status)) {
1483 return FALSE;
1484 }
1485 if (U_FAILURE(fDeferredStatus)) {
1486 status = fDeferredStatus;
1487 return FALSE;
1488 }
1489 reset();
1490
1491 if (start < 0) {
1492 status = U_INDEX_OUTOFBOUNDS_ERROR;
1493 return FALSE;
1494 }
1495
1496 if (fInputUniStrMaybeMutable) {
1497 if (compat_SyncMutableUTextContents(fInputText)) {
1498 fInputLength = utext_nativeLength(fInputText);
1499 reset();
1500 }
1501 }
1502
1503 int64_t nativeStart;
1504 nativeStart = start;
1505 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1506 status = U_INDEX_OUTOFBOUNDS_ERROR;
1507 return FALSE;
1508 }
1509
1510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1511 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1512 } else {
1513 MatchAt(nativeStart, FALSE, status);
1514 }
1515 return fMatch;
1516 }
1517
1518
1519
1520 //--------------------------------------------------------------------------------
1521 //
1522 // matches()
1523 //
1524 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1525 UBool RegexMatcher::matches(UErrorCode &status) {
1526 if (U_FAILURE(status)) {
1527 return FALSE;
1528 }
1529 if (U_FAILURE(fDeferredStatus)) {
1530 status = fDeferredStatus;
1531 return FALSE;
1532 }
1533
1534 if (fInputUniStrMaybeMutable) {
1535 if (compat_SyncMutableUTextContents(fInputText)) {
1536 fInputLength = utext_nativeLength(fInputText);
1537 reset();
1538 }
1539 }
1540 else {
1541 resetPreserveRegion();
1542 }
1543
1544 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1545 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1546 } else {
1547 MatchAt(fActiveStart, TRUE, status);
1548 }
1549 return fMatch;
1550 }
1551
1552
matches(int64_t start,UErrorCode & status)1553 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1554 if (U_FAILURE(status)) {
1555 return FALSE;
1556 }
1557 if (U_FAILURE(fDeferredStatus)) {
1558 status = fDeferredStatus;
1559 return FALSE;
1560 }
1561 reset();
1562
1563 if (start < 0) {
1564 status = U_INDEX_OUTOFBOUNDS_ERROR;
1565 return FALSE;
1566 }
1567
1568 if (fInputUniStrMaybeMutable) {
1569 if (compat_SyncMutableUTextContents(fInputText)) {
1570 fInputLength = utext_nativeLength(fInputText);
1571 reset();
1572 }
1573 }
1574
1575 int64_t nativeStart;
1576 nativeStart = start;
1577 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1578 status = U_INDEX_OUTOFBOUNDS_ERROR;
1579 return FALSE;
1580 }
1581
1582 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1583 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1584 } else {
1585 MatchAt(nativeStart, TRUE, status);
1586 }
1587 return fMatch;
1588 }
1589
1590
1591
1592 //--------------------------------------------------------------------------------
1593 //
1594 // pattern
1595 //
1596 //--------------------------------------------------------------------------------
pattern() const1597 const RegexPattern &RegexMatcher::pattern() const {
1598 return *fPattern;
1599 }
1600
1601
1602
1603 //--------------------------------------------------------------------------------
1604 //
1605 // region
1606 //
1607 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1608 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1609 if (U_FAILURE(status)) {
1610 return *this;
1611 }
1612
1613 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1614 status = U_ILLEGAL_ARGUMENT_ERROR;
1615 }
1616
1617 int64_t nativeStart = regionStart;
1618 int64_t nativeLimit = regionLimit;
1619 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1620 status = U_ILLEGAL_ARGUMENT_ERROR;
1621 }
1622
1623 if (startIndex == -1)
1624 this->reset();
1625 else
1626 resetPreserveRegion();
1627
1628 fRegionStart = nativeStart;
1629 fRegionLimit = nativeLimit;
1630 fActiveStart = nativeStart;
1631 fActiveLimit = nativeLimit;
1632
1633 if (startIndex != -1) {
1634 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1635 status = U_INDEX_OUTOFBOUNDS_ERROR;
1636 }
1637 fMatchEnd = startIndex;
1638 }
1639
1640 if (!fTransparentBounds) {
1641 fLookStart = nativeStart;
1642 fLookLimit = nativeLimit;
1643 }
1644 if (fAnchoringBounds) {
1645 fAnchorStart = nativeStart;
1646 fAnchorLimit = nativeLimit;
1647 }
1648 return *this;
1649 }
1650
region(int64_t start,int64_t limit,UErrorCode & status)1651 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1652 return region(start, limit, -1, status);
1653 }
1654
1655 //--------------------------------------------------------------------------------
1656 //
1657 // regionEnd
1658 //
1659 //--------------------------------------------------------------------------------
regionEnd() const1660 int32_t RegexMatcher::regionEnd() const {
1661 return (int32_t)fRegionLimit;
1662 }
1663
regionEnd64() const1664 int64_t RegexMatcher::regionEnd64() const {
1665 return fRegionLimit;
1666 }
1667
1668 //--------------------------------------------------------------------------------
1669 //
1670 // regionStart
1671 //
1672 //--------------------------------------------------------------------------------
regionStart() const1673 int32_t RegexMatcher::regionStart() const {
1674 return (int32_t)fRegionStart;
1675 }
1676
regionStart64() const1677 int64_t RegexMatcher::regionStart64() const {
1678 return fRegionStart;
1679 }
1680
1681
1682 //--------------------------------------------------------------------------------
1683 //
1684 // replaceAll
1685 //
1686 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1687 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1688 UText replacementText = UTEXT_INITIALIZER;
1689 UText resultText = UTEXT_INITIALIZER;
1690 UnicodeString resultString;
1691 if (U_FAILURE(status)) {
1692 return resultString;
1693 }
1694
1695 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1696 utext_openUnicodeString(&resultText, &resultString, &status);
1697
1698 replaceAll(&replacementText, &resultText, status);
1699
1700 utext_close(&resultText);
1701 utext_close(&replacementText);
1702
1703 return resultString;
1704 }
1705
1706
1707 //
1708 // replaceAll, UText mode
1709 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1710 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1711 if (U_FAILURE(status)) {
1712 return dest;
1713 }
1714 if (U_FAILURE(fDeferredStatus)) {
1715 status = fDeferredStatus;
1716 return dest;
1717 }
1718
1719 if (dest == NULL) {
1720 UnicodeString emptyString;
1721 UText empty = UTEXT_INITIALIZER;
1722
1723 utext_openUnicodeString(&empty, &emptyString, &status);
1724 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1725 utext_close(&empty);
1726 }
1727
1728 if (U_SUCCESS(status)) {
1729 reset();
1730 while (find()) {
1731 appendReplacement(dest, replacement, status);
1732 if (U_FAILURE(status)) {
1733 break;
1734 }
1735 }
1736 appendTail(dest, status);
1737 }
1738
1739 return dest;
1740 }
1741
1742
1743 //--------------------------------------------------------------------------------
1744 //
1745 // replaceFirst
1746 //
1747 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1748 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1749 UText replacementText = UTEXT_INITIALIZER;
1750 UText resultText = UTEXT_INITIALIZER;
1751 UnicodeString resultString;
1752
1753 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754 utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756 replaceFirst(&replacementText, &resultText, status);
1757
1758 utext_close(&resultText);
1759 utext_close(&replacementText);
1760
1761 return resultString;
1762 }
1763
1764 //
1765 // replaceFirst, UText mode
1766 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1767 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1768 if (U_FAILURE(status)) {
1769 return dest;
1770 }
1771 if (U_FAILURE(fDeferredStatus)) {
1772 status = fDeferredStatus;
1773 return dest;
1774 }
1775
1776 reset();
1777 if (!find()) {
1778 return getInput(dest, status);
1779 }
1780
1781 if (dest == NULL) {
1782 UnicodeString emptyString;
1783 UText empty = UTEXT_INITIALIZER;
1784
1785 utext_openUnicodeString(&empty, &emptyString, &status);
1786 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1787 utext_close(&empty);
1788 }
1789
1790 appendReplacement(dest, replacement, status);
1791 appendTail(dest, status);
1792
1793 return dest;
1794 }
1795
1796
1797 //--------------------------------------------------------------------------------
1798 //
1799 // requireEnd
1800 //
1801 //--------------------------------------------------------------------------------
requireEnd() const1802 UBool RegexMatcher::requireEnd() const {
1803 return fRequireEnd;
1804 }
1805
1806
1807 //--------------------------------------------------------------------------------
1808 //
1809 // reset
1810 //
1811 //--------------------------------------------------------------------------------
reset()1812 RegexMatcher &RegexMatcher::reset() {
1813 fRegionStart = 0;
1814 fRegionLimit = fInputLength;
1815 fActiveStart = 0;
1816 fActiveLimit = fInputLength;
1817 fAnchorStart = 0;
1818 fAnchorLimit = fInputLength;
1819 fLookStart = 0;
1820 fLookLimit = fInputLength;
1821 resetPreserveRegion();
1822 return *this;
1823 }
1824
1825
1826
resetPreserveRegion()1827 void RegexMatcher::resetPreserveRegion() {
1828 fMatchStart = 0;
1829 fMatchEnd = 0;
1830 fLastMatchEnd = -1;
1831 fAppendPosition = 0;
1832 fMatch = FALSE;
1833 fHitEnd = FALSE;
1834 fRequireEnd = FALSE;
1835 fTime = 0;
1836 fTickCounter = TIMER_INITIAL_VALUE;
1837 //resetStack(); // more expensive than it looks...
1838 }
1839
1840
reset(const UnicodeString & input)1841 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1842 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1843 if (fPattern->fNeedsAltInput) {
1844 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1845 }
1846 if (U_FAILURE(fDeferredStatus)) {
1847 return *this;
1848 }
1849 fInputLength = utext_nativeLength(fInputText);
1850
1851 reset();
1852 delete fInput;
1853 fInput = NULL;
1854
1855 // Do the following for any UnicodeString.
1856 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1857 fInputUniStrMaybeMutable = TRUE;
1858
1859 if (fWordBreakItr != NULL) {
1860 #if UCONFIG_NO_BREAK_ITERATION==0
1861 UErrorCode status = U_ZERO_ERROR;
1862 fWordBreakItr->setText(fInputText, status);
1863 #endif
1864 }
1865 return *this;
1866 }
1867
1868
reset(UText * input)1869 RegexMatcher &RegexMatcher::reset(UText *input) {
1870 if (fInputText != input) {
1871 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1872 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1873 if (U_FAILURE(fDeferredStatus)) {
1874 return *this;
1875 }
1876 fInputLength = utext_nativeLength(fInputText);
1877
1878 delete fInput;
1879 fInput = NULL;
1880
1881 if (fWordBreakItr != NULL) {
1882 #if UCONFIG_NO_BREAK_ITERATION==0
1883 UErrorCode status = U_ZERO_ERROR;
1884 fWordBreakItr->setText(input, status);
1885 #endif
1886 }
1887 }
1888 reset();
1889 fInputUniStrMaybeMutable = FALSE;
1890
1891 return *this;
1892 }
1893
1894 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1895 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1896 return *this;
1897 }*/
1898
reset(int64_t position,UErrorCode & status)1899 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1900 if (U_FAILURE(status)) {
1901 return *this;
1902 }
1903 reset(); // Reset also resets the region to be the entire string.
1904
1905 if (position < 0 || position > fActiveLimit) {
1906 status = U_INDEX_OUTOFBOUNDS_ERROR;
1907 return *this;
1908 }
1909 fMatchEnd = position;
1910 return *this;
1911 }
1912
1913
1914 //--------------------------------------------------------------------------------
1915 //
1916 // refresh
1917 //
1918 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1919 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1920 if (U_FAILURE(status)) {
1921 return *this;
1922 }
1923 if (input == NULL) {
1924 status = U_ILLEGAL_ARGUMENT_ERROR;
1925 return *this;
1926 }
1927 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1928 status = U_ILLEGAL_ARGUMENT_ERROR;
1929 return *this;
1930 }
1931 int64_t pos = utext_getNativeIndex(fInputText);
1932 // Shallow read-only clone of the new UText into the existing input UText
1933 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1934 if (U_FAILURE(status)) {
1935 return *this;
1936 }
1937 utext_setNativeIndex(fInputText, pos);
1938
1939 if (fAltInputText != NULL) {
1940 pos = utext_getNativeIndex(fAltInputText);
1941 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1942 if (U_FAILURE(status)) {
1943 return *this;
1944 }
1945 utext_setNativeIndex(fAltInputText, pos);
1946 }
1947 return *this;
1948 }
1949
1950
1951
1952 //--------------------------------------------------------------------------------
1953 //
1954 // setTrace
1955 //
1956 //--------------------------------------------------------------------------------
setTrace(UBool state)1957 void RegexMatcher::setTrace(UBool state) {
1958 fTraceDebug = state;
1959 }
1960
1961
1962
1963 /**
1964 * UText, replace entire contents of the destination UText with a substring of the source UText.
1965 *
1966 * @param src The source UText
1967 * @param dest The destination UText. Must be writable.
1968 * May be NULL, in which case a new UText will be allocated.
1969 * @param start Start index of source substring.
1970 * @param limit Limit index of source substring.
1971 * @param status An error code.
1972 */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1973 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1974 if (U_FAILURE(*status)) {
1975 return dest;
1976 }
1977 if (start == limit) {
1978 if (dest) {
1979 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1980 return dest;
1981 } else {
1982 return utext_openUChars(NULL, NULL, 0, status);
1983 }
1984 }
1985 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1986 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1987 return dest;
1988 }
1989 *status = U_ZERO_ERROR;
1990 MaybeStackArray<UChar, 40> buffer;
1991 if (length >= buffer.getCapacity()) {
1992 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
1993 if (newBuf == NULL) {
1994 *status = U_MEMORY_ALLOCATION_ERROR;
1995 }
1996 }
1997 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1998 if (dest) {
1999 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2000 return dest;
2001 }
2002
2003 // Caller did not provide a prexisting UText.
2004 // Open a new one, and have it adopt the text buffer storage.
2005 if (U_FAILURE(*status)) {
2006 return NULL;
2007 }
2008 int32_t ownedLength = 0;
2009 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2010 if (ownedBuf == NULL) {
2011 *status = U_MEMORY_ALLOCATION_ERROR;
2012 return NULL;
2013 }
2014 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2015 if (U_FAILURE(*status)) {
2016 uprv_free(ownedBuf);
2017 return NULL;
2018 }
2019 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2020 return result;
2021 }
2022
2023
2024 //---------------------------------------------------------------------
2025 //
2026 // split
2027 //
2028 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2029 int32_t RegexMatcher::split(const UnicodeString &input,
2030 UnicodeString dest[],
2031 int32_t destCapacity,
2032 UErrorCode &status)
2033 {
2034 UText inputText = UTEXT_INITIALIZER;
2035 utext_openConstUnicodeString(&inputText, &input, &status);
2036 if (U_FAILURE(status)) {
2037 return 0;
2038 }
2039
2040 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2041 if (destText == NULL) {
2042 status = U_MEMORY_ALLOCATION_ERROR;
2043 return 0;
2044 }
2045 int32_t i;
2046 for (i = 0; i < destCapacity; i++) {
2047 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2048 }
2049
2050 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2051
2052 for (i = 0; i < destCapacity; i++) {
2053 utext_close(destText[i]);
2054 }
2055
2056 uprv_free(destText);
2057 utext_close(&inputText);
2058 return fieldCount;
2059 }
2060
2061 //
2062 // split, UText mode
2063 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2064 int32_t RegexMatcher::split(UText *input,
2065 UText *dest[],
2066 int32_t destCapacity,
2067 UErrorCode &status)
2068 {
2069 //
2070 // Check arguements for validity
2071 //
2072 if (U_FAILURE(status)) {
2073 return 0;
2074 };
2075
2076 if (destCapacity < 1) {
2077 status = U_ILLEGAL_ARGUMENT_ERROR;
2078 return 0;
2079 }
2080
2081 //
2082 // Reset for the input text
2083 //
2084 reset(input);
2085 int64_t nextOutputStringStart = 0;
2086 if (fActiveLimit == 0) {
2087 return 0;
2088 }
2089
2090 //
2091 // Loop through the input text, searching for the delimiter pattern
2092 //
2093 int32_t i;
2094 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2095 for (i=0; ; i++) {
2096 if (i>=destCapacity-1) {
2097 // There is one or zero output string left.
2098 // Fill the last output string with whatever is left from the input, then exit the loop.
2099 // ( i will be == destCapacity if we filled the output array while processing
2100 // capture groups of the delimiter expression, in which case we will discard the
2101 // last capture group saved in favor of the unprocessed remainder of the
2102 // input string.)
2103 i = destCapacity-1;
2104 if (fActiveLimit > nextOutputStringStart) {
2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2106 if (dest[i]) {
2107 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2108 input->chunkContents+nextOutputStringStart,
2109 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2110 } else {
2111 UText remainingText = UTEXT_INITIALIZER;
2112 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2113 fActiveLimit-nextOutputStringStart, &status);
2114 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2115 utext_close(&remainingText);
2116 }
2117 } else {
2118 UErrorCode lengthStatus = U_ZERO_ERROR;
2119 int32_t remaining16Length =
2120 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2121 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2122 if (remainingChars == NULL) {
2123 status = U_MEMORY_ALLOCATION_ERROR;
2124 break;
2125 }
2126
2127 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2128 if (dest[i]) {
2129 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2130 } else {
2131 UText remainingText = UTEXT_INITIALIZER;
2132 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2133 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2134 utext_close(&remainingText);
2135 }
2136
2137 uprv_free(remainingChars);
2138 }
2139 }
2140 break;
2141 }
2142 if (find()) {
2143 // We found another delimiter. Move everything from where we started looking
2144 // up until the start of the delimiter into the next output string.
2145 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2146 if (dest[i]) {
2147 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2148 input->chunkContents+nextOutputStringStart,
2149 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2150 } else {
2151 UText remainingText = UTEXT_INITIALIZER;
2152 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2153 fMatchStart-nextOutputStringStart, &status);
2154 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2155 utext_close(&remainingText);
2156 }
2157 } else {
2158 UErrorCode lengthStatus = U_ZERO_ERROR;
2159 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2160 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2161 if (remainingChars == NULL) {
2162 status = U_MEMORY_ALLOCATION_ERROR;
2163 break;
2164 }
2165 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2166 if (dest[i]) {
2167 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2168 } else {
2169 UText remainingText = UTEXT_INITIALIZER;
2170 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2171 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2172 utext_close(&remainingText);
2173 }
2174
2175 uprv_free(remainingChars);
2176 }
2177 nextOutputStringStart = fMatchEnd;
2178
2179 // If the delimiter pattern has capturing parentheses, the captured
2180 // text goes out into the next n destination strings.
2181 int32_t groupNum;
2182 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2183 if (i >= destCapacity-2) {
2184 // Never fill the last available output string with capture group text.
2185 // It will filled with the last field, the remainder of the
2186 // unsplit input text.
2187 break;
2188 }
2189 i++;
2190 dest[i] = utext_extract_replace(fInputText, dest[i],
2191 start64(groupNum, status), end64(groupNum, status), &status);
2192 }
2193
2194 if (nextOutputStringStart == fActiveLimit) {
2195 // The delimiter was at the end of the string. We're done, but first
2196 // we output one last empty string, for the empty field following
2197 // the delimiter at the end of input.
2198 if (i+1 < destCapacity) {
2199 ++i;
2200 if (dest[i] == NULL) {
2201 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2202 } else {
2203 static const UChar emptyString[] = {(UChar)0};
2204 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2205 }
2206 }
2207 break;
2208
2209 }
2210 }
2211 else
2212 {
2213 // We ran off the end of the input while looking for the next delimiter.
2214 // All the remaining text goes into the current output string.
2215 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2216 if (dest[i]) {
2217 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2218 input->chunkContents+nextOutputStringStart,
2219 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2220 } else {
2221 UText remainingText = UTEXT_INITIALIZER;
2222 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2223 fActiveLimit-nextOutputStringStart, &status);
2224 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2225 utext_close(&remainingText);
2226 }
2227 } else {
2228 UErrorCode lengthStatus = U_ZERO_ERROR;
2229 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2230 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2231 if (remainingChars == NULL) {
2232 status = U_MEMORY_ALLOCATION_ERROR;
2233 break;
2234 }
2235
2236 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2237 if (dest[i]) {
2238 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2239 } else {
2240 UText remainingText = UTEXT_INITIALIZER;
2241 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2242 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2243 utext_close(&remainingText);
2244 }
2245
2246 uprv_free(remainingChars);
2247 }
2248 break;
2249 }
2250 if (U_FAILURE(status)) {
2251 break;
2252 }
2253 } // end of for loop
2254 return i+1;
2255 }
2256
2257
2258 //--------------------------------------------------------------------------------
2259 //
2260 // start
2261 //
2262 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2263 int32_t RegexMatcher::start(UErrorCode &status) const {
2264 return start(0, status);
2265 }
2266
start64(UErrorCode & status) const2267 int64_t RegexMatcher::start64(UErrorCode &status) const {
2268 return start64(0, status);
2269 }
2270
2271 //--------------------------------------------------------------------------------
2272 //
2273 // start(int32_t group, UErrorCode &status)
2274 //
2275 //--------------------------------------------------------------------------------
2276
start64(int32_t group,UErrorCode & status) const2277 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2278 if (U_FAILURE(status)) {
2279 return -1;
2280 }
2281 if (U_FAILURE(fDeferredStatus)) {
2282 status = fDeferredStatus;
2283 return -1;
2284 }
2285 if (fMatch == FALSE) {
2286 status = U_REGEX_INVALID_STATE;
2287 return -1;
2288 }
2289 if (group < 0 || group > fPattern->fGroupMap->size()) {
2290 status = U_INDEX_OUTOFBOUNDS_ERROR;
2291 return -1;
2292 }
2293 int64_t s;
2294 if (group == 0) {
2295 s = fMatchStart;
2296 } else {
2297 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2298 U_ASSERT(groupOffset < fPattern->fFrameSize);
2299 U_ASSERT(groupOffset >= 0);
2300 s = fFrame->fExtra[groupOffset];
2301 }
2302
2303 return s;
2304 }
2305
2306
start(int32_t group,UErrorCode & status) const2307 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2308 return (int32_t)start64(group, status);
2309 }
2310
2311 //--------------------------------------------------------------------------------
2312 //
2313 // useAnchoringBounds
2314 //
2315 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2316 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2317 fAnchoringBounds = b;
2318 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2319 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2320 return *this;
2321 }
2322
2323
2324 //--------------------------------------------------------------------------------
2325 //
2326 // useTransparentBounds
2327 //
2328 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2329 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2330 fTransparentBounds = b;
2331 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2332 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2333 return *this;
2334 }
2335
2336 //--------------------------------------------------------------------------------
2337 //
2338 // setTimeLimit
2339 //
2340 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2341 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2342 if (U_FAILURE(status)) {
2343 return;
2344 }
2345 if (U_FAILURE(fDeferredStatus)) {
2346 status = fDeferredStatus;
2347 return;
2348 }
2349 if (limit < 0) {
2350 status = U_ILLEGAL_ARGUMENT_ERROR;
2351 return;
2352 }
2353 fTimeLimit = limit;
2354 }
2355
2356
2357 //--------------------------------------------------------------------------------
2358 //
2359 // getTimeLimit
2360 //
2361 //--------------------------------------------------------------------------------
getTimeLimit() const2362 int32_t RegexMatcher::getTimeLimit() const {
2363 return fTimeLimit;
2364 }
2365
2366
2367 //--------------------------------------------------------------------------------
2368 //
2369 // setStackLimit
2370 //
2371 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2372 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2373 if (U_FAILURE(status)) {
2374 return;
2375 }
2376 if (U_FAILURE(fDeferredStatus)) {
2377 status = fDeferredStatus;
2378 return;
2379 }
2380 if (limit < 0) {
2381 status = U_ILLEGAL_ARGUMENT_ERROR;
2382 return;
2383 }
2384
2385 // Reset the matcher. This is needed here in case there is a current match
2386 // whose final stack frame (containing the match results, pointed to by fFrame)
2387 // would be lost by resizing to a smaller stack size.
2388 reset();
2389
2390 if (limit == 0) {
2391 // Unlimited stack expansion
2392 fStack->setMaxCapacity(0);
2393 } else {
2394 // Change the units of the limit from bytes to ints, and bump the size up
2395 // to be big enough to hold at least one stack frame for the pattern,
2396 // if it isn't there already.
2397 int32_t adjustedLimit = limit / sizeof(int32_t);
2398 if (adjustedLimit < fPattern->fFrameSize) {
2399 adjustedLimit = fPattern->fFrameSize;
2400 }
2401 fStack->setMaxCapacity(adjustedLimit);
2402 }
2403 fStackLimit = limit;
2404 }
2405
2406
2407 //--------------------------------------------------------------------------------
2408 //
2409 // getStackLimit
2410 //
2411 //--------------------------------------------------------------------------------
getStackLimit() const2412 int32_t RegexMatcher::getStackLimit() const {
2413 return fStackLimit;
2414 }
2415
2416
2417 //--------------------------------------------------------------------------------
2418 //
2419 // setMatchCallback
2420 //
2421 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2422 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2423 const void *context,
2424 UErrorCode &status) {
2425 if (U_FAILURE(status)) {
2426 return;
2427 }
2428 fCallbackFn = callback;
2429 fCallbackContext = context;
2430 }
2431
2432
2433 //--------------------------------------------------------------------------------
2434 //
2435 // getMatchCallback
2436 //
2437 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2438 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2439 const void *&context,
2440 UErrorCode &status) {
2441 if (U_FAILURE(status)) {
2442 return;
2443 }
2444 callback = fCallbackFn;
2445 context = fCallbackContext;
2446 }
2447
2448
2449 //--------------------------------------------------------------------------------
2450 //
2451 // setMatchCallback
2452 //
2453 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2454 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2455 const void *context,
2456 UErrorCode &status) {
2457 if (U_FAILURE(status)) {
2458 return;
2459 }
2460 fFindProgressCallbackFn = callback;
2461 fFindProgressCallbackContext = context;
2462 }
2463
2464
2465 //--------------------------------------------------------------------------------
2466 //
2467 // getMatchCallback
2468 //
2469 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2470 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2471 const void *&context,
2472 UErrorCode &status) {
2473 if (U_FAILURE(status)) {
2474 return;
2475 }
2476 callback = fFindProgressCallbackFn;
2477 context = fFindProgressCallbackContext;
2478 }
2479
2480
2481 //================================================================================
2482 //
2483 // Code following this point in this file is the internal
2484 // Match Engine Implementation.
2485 //
2486 //================================================================================
2487
2488
2489 //--------------------------------------------------------------------------------
2490 //
2491 // resetStack
2492 // Discard any previous contents of the state save stack, and initialize a
2493 // new stack frame to all -1. The -1s are needed for capture group limits,
2494 // where they indicate that a group has not yet matched anything.
2495 //--------------------------------------------------------------------------------
resetStack()2496 REStackFrame *RegexMatcher::resetStack() {
2497 // Discard any previous contents of the state save stack, and initialize a
2498 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2499 // where they indicate that a group has not yet matched anything.
2500 fStack->removeAllElements();
2501
2502 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2503 if(U_FAILURE(fDeferredStatus)) {
2504 return NULL;
2505 }
2506
2507 int32_t i;
2508 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2509 iFrame->fExtra[i] = -1;
2510 }
2511 return iFrame;
2512 }
2513
2514
2515
2516 //--------------------------------------------------------------------------------
2517 //
2518 // isWordBoundary
2519 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2520 // For us,
2521 // If the current char is a combining mark,
2522 // \b is FALSE.
2523 // Else Scan backwards to the first non-combining char.
2524 // We are at a boundary if the this char and the original chars are
2525 // opposite in membership in \w set
2526 //
2527 // parameters: pos - the current position in the input buffer
2528 //
2529 // TODO: double-check edge cases at region boundaries.
2530 //
2531 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2532 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2533 UBool isBoundary = FALSE;
2534 UBool cIsWord = FALSE;
2535
2536 if (pos >= fLookLimit) {
2537 fHitEnd = TRUE;
2538 } else {
2539 // Determine whether char c at current position is a member of the word set of chars.
2540 // If we're off the end of the string, behave as though we're not at a word char.
2541 UTEXT_SETNATIVEINDEX(fInputText, pos);
2542 UChar32 c = UTEXT_CURRENT32(fInputText);
2543 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2544 // Current char is a combining one. Not a boundary.
2545 return FALSE;
2546 }
2547 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2548 }
2549
2550 // Back up until we come to a non-combining char, determine whether
2551 // that char is a word char.
2552 UBool prevCIsWord = FALSE;
2553 for (;;) {
2554 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2555 break;
2556 }
2557 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2558 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2559 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2560 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2561 break;
2562 }
2563 }
2564 isBoundary = cIsWord ^ prevCIsWord;
2565 return isBoundary;
2566 }
2567
isChunkWordBoundary(int32_t pos)2568 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2569 UBool isBoundary = FALSE;
2570 UBool cIsWord = FALSE;
2571
2572 const UChar *inputBuf = fInputText->chunkContents;
2573
2574 if (pos >= fLookLimit) {
2575 fHitEnd = TRUE;
2576 } else {
2577 // Determine whether char c at current position is a member of the word set of chars.
2578 // If we're off the end of the string, behave as though we're not at a word char.
2579 UChar32 c;
2580 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2581 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2582 // Current char is a combining one. Not a boundary.
2583 return FALSE;
2584 }
2585 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2586 }
2587
2588 // Back up until we come to a non-combining char, determine whether
2589 // that char is a word char.
2590 UBool prevCIsWord = FALSE;
2591 for (;;) {
2592 if (pos <= fLookStart) {
2593 break;
2594 }
2595 UChar32 prevChar;
2596 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2597 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2598 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2599 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2600 break;
2601 }
2602 }
2603 isBoundary = cIsWord ^ prevCIsWord;
2604 return isBoundary;
2605 }
2606
2607 //--------------------------------------------------------------------------------
2608 //
2609 // isUWordBoundary
2610 //
2611 // Test for a word boundary using RBBI word break.
2612 //
2613 // parameters: pos - the current position in the input buffer
2614 //
2615 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2616 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2617 UBool returnVal = FALSE;
2618 #if UCONFIG_NO_BREAK_ITERATION==0
2619
2620 // If we haven't yet created a break iterator for this matcher, do it now.
2621 if (fWordBreakItr == NULL) {
2622 fWordBreakItr =
2623 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2624 if (U_FAILURE(fDeferredStatus)) {
2625 return FALSE;
2626 }
2627 fWordBreakItr->setText(fInputText, fDeferredStatus);
2628 }
2629
2630 if (pos >= fLookLimit) {
2631 fHitEnd = TRUE;
2632 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2633 // words are not boundaries. All non-word chars stand by themselves,
2634 // with word boundaries on both sides.
2635 } else {
2636 if (!UTEXT_USES_U16(fInputText)) {
2637 // !!!: Would like a better way to do this!
2638 UErrorCode status = U_ZERO_ERROR;
2639 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2640 }
2641 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2642 }
2643 #endif
2644 return returnVal;
2645 }
2646
2647 //--------------------------------------------------------------------------------
2648 //
2649 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2650 // saves. Increment the "time" counter, and call the
2651 // user callback function if there is one installed.
2652 //
2653 // If the match operation needs to be aborted, either for a time-out
2654 // or because the user callback asked for it, just set an error status.
2655 // The engine will pick that up and stop in its outer loop.
2656 //
2657 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2658 void RegexMatcher::IncrementTime(UErrorCode &status) {
2659 fTickCounter = TIMER_INITIAL_VALUE;
2660 fTime++;
2661 if (fCallbackFn != NULL) {
2662 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2663 status = U_REGEX_STOPPED_BY_CALLER;
2664 return;
2665 }
2666 }
2667 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2668 status = U_REGEX_TIME_OUT;
2669 }
2670 }
2671
2672 //--------------------------------------------------------------------------------
2673 //
2674 // StateSave
2675 // Make a new stack frame, initialized as a copy of the current stack frame.
2676 // Set the pattern index in the original stack frame from the operand value
2677 // in the opcode. Execution of the engine continues with the state in
2678 // the newly created stack frame
2679 //
2680 // Note that reserveBlock() may grow the stack, resulting in the
2681 // whole thing being relocated in memory.
2682 //
2683 // Parameters:
2684 // fp The top frame pointer when called. At return, a new
2685 // fame will be present
2686 // savePatIdx An index into the compiled pattern. Goes into the original
2687 // (not new) frame. If execution ever back-tracks out of the
2688 // new frame, this will be where we continue from in the pattern.
2689 // Return
2690 // The new frame pointer.
2691 //
2692 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2693 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2694 if (U_FAILURE(status)) {
2695 return fp;
2696 }
2697 // push storage for a new frame.
2698 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2699 if (U_FAILURE(status)) {
2700 // Failure on attempted stack expansion.
2701 // Stack function set some other error code, change it to a more
2702 // specific one for regular expressions.
2703 status = U_REGEX_STACK_OVERFLOW;
2704 // We need to return a writable stack frame, so just return the
2705 // previous frame. The match operation will stop quickly
2706 // because of the error status, after which the frame will never
2707 // be looked at again.
2708 return fp;
2709 }
2710 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2711
2712 // New stack frame = copy of old top frame.
2713 int64_t *source = (int64_t *)fp;
2714 int64_t *dest = newFP;
2715 for (;;) {
2716 *dest++ = *source++;
2717 if (source == newFP) {
2718 break;
2719 }
2720 }
2721
2722 fTickCounter--;
2723 if (fTickCounter <= 0) {
2724 IncrementTime(status); // Re-initializes fTickCounter
2725 }
2726 fp->fPatIdx = savePatIdx;
2727 return (REStackFrame *)newFP;
2728 }
2729
2730 #if defined(REGEX_DEBUG)
2731 namespace {
StringFromUText(UText * ut)2732 UnicodeString StringFromUText(UText *ut) {
2733 UnicodeString result;
2734 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2735 result.append(c);
2736 }
2737 return result;
2738 }
2739 }
2740 #endif // REGEX_DEBUG
2741
2742
2743 //--------------------------------------------------------------------------------
2744 //
2745 // MatchAt This is the actual matching engine.
2746 //
2747 // startIdx: begin matching a this index.
2748 // toEnd: if true, match must extend to end of the input region
2749 //
2750 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752 UBool isMatch = FALSE; // True if the we have a match.
2753
2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2755
2756 int32_t op; // Operation from the compiled pattern, split into
2757 int32_t opType; // the opcode
2758 int32_t opValue; // and the operand value.
2759
2760 #ifdef REGEX_RUN_DEBUG
2761 if (fTraceDebug) {
2762 printf("MatchAt(startIdx=%ld)\n", startIdx);
2763 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2764 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2765 }
2766 #endif
2767
2768 if (U_FAILURE(status)) {
2769 return;
2770 }
2771
2772 // Cache frequently referenced items from the compiled pattern
2773 //
2774 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2775
2776 const UChar *litText = fPattern->fLiteralText.getBuffer();
2777 UVector *fSets = fPattern->fSets;
2778
2779 fFrameSize = fPattern->fFrameSize;
2780 REStackFrame *fp = resetStack();
2781 if (U_FAILURE(fDeferredStatus)) {
2782 status = fDeferredStatus;
2783 return;
2784 }
2785
2786 fp->fPatIdx = 0;
2787 fp->fInputIdx = startIdx;
2788
2789 // Zero out the pattern's static data
2790 int32_t i;
2791 for (i = 0; i<fPattern->fDataSize; i++) {
2792 fData[i] = 0;
2793 }
2794
2795 //
2796 // Main loop for interpreting the compiled pattern.
2797 // One iteration of the loop per pattern operation performed.
2798 //
2799 for (;;) {
2800 op = (int32_t)pat[fp->fPatIdx];
2801 opType = URX_TYPE(op);
2802 opValue = URX_VAL(op);
2803 #ifdef REGEX_RUN_DEBUG
2804 if (fTraceDebug) {
2805 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2806 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2807 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2808 fPattern->dumpOp(fp->fPatIdx);
2809 }
2810 #endif
2811 fp->fPatIdx++;
2812
2813 switch (opType) {
2814
2815
2816 case URX_NOP:
2817 break;
2818
2819
2820 case URX_BACKTRACK:
2821 // Force a backtrack. In some circumstances, the pattern compiler
2822 // will notice that the pattern can't possibly match anything, and will
2823 // emit one of these at that point.
2824 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2825 break;
2826
2827
2828 case URX_ONECHAR:
2829 if (fp->fInputIdx < fActiveLimit) {
2830 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2831 UChar32 c = UTEXT_NEXT32(fInputText);
2832 if (c == opValue) {
2833 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2834 break;
2835 }
2836 } else {
2837 fHitEnd = TRUE;
2838 }
2839 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2840 break;
2841
2842
2843 case URX_STRING:
2844 {
2845 // Test input against a literal string.
2846 // Strings require two slots in the compiled pattern, one for the
2847 // offset to the string text, and one for the length.
2848
2849 int32_t stringStartIdx = opValue;
2850 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2851 fp->fPatIdx++;
2852 opType = URX_TYPE(op);
2853 int32_t stringLen = URX_VAL(op);
2854 U_ASSERT(opType == URX_STRING_LEN);
2855 U_ASSERT(stringLen >= 2);
2856
2857 const UChar *patternString = litText+stringStartIdx;
2858 int32_t patternStringIndex = 0;
2859 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2860 UChar32 inputChar;
2861 UChar32 patternChar;
2862 UBool success = TRUE;
2863 while (patternStringIndex < stringLen) {
2864 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2865 success = FALSE;
2866 fHitEnd = TRUE;
2867 break;
2868 }
2869 inputChar = UTEXT_NEXT32(fInputText);
2870 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2871 if (patternChar != inputChar) {
2872 success = FALSE;
2873 break;
2874 }
2875 }
2876
2877 if (success) {
2878 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2879 } else {
2880 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2881 }
2882 }
2883 break;
2884
2885
2886 case URX_STATE_SAVE:
2887 fp = StateSave(fp, opValue, status);
2888 break;
2889
2890
2891 case URX_END:
2892 // The match loop will exit via this path on a successful match,
2893 // when we reach the end of the pattern.
2894 if (toEnd && fp->fInputIdx != fActiveLimit) {
2895 // The pattern matched, but not to the end of input. Try some more.
2896 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2897 break;
2898 }
2899 isMatch = TRUE;
2900 goto breakFromLoop;
2901
2902 // Start and End Capture stack frame variables are laid out out like this:
2903 // fp->fExtra[opValue] - The start of a completed capture group
2904 // opValue+1 - The end of a completed capture group
2905 // opValue+2 - the start of a capture group whose end
2906 // has not yet been reached (and might not ever be).
2907 case URX_START_CAPTURE:
2908 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2909 fp->fExtra[opValue+2] = fp->fInputIdx;
2910 break;
2911
2912
2913 case URX_END_CAPTURE:
2914 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2915 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2916 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2917 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2918 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2919 break;
2920
2921
2922 case URX_DOLLAR: // $, test for End of line
2923 // or for position before new line at end of input
2924 {
2925 if (fp->fInputIdx >= fAnchorLimit) {
2926 // We really are at the end of input. Success.
2927 fHitEnd = TRUE;
2928 fRequireEnd = TRUE;
2929 break;
2930 }
2931
2932 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2933
2934 // If we are positioned just before a new-line that is located at the
2935 // end of input, succeed.
2936 UChar32 c = UTEXT_NEXT32(fInputText);
2937 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2938 if (isLineTerminator(c)) {
2939 // If not in the middle of a CR/LF sequence
2940 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2941 // At new-line at end of input. Success
2942 fHitEnd = TRUE;
2943 fRequireEnd = TRUE;
2944
2945 break;
2946 }
2947 }
2948 } else {
2949 UChar32 nextC = UTEXT_NEXT32(fInputText);
2950 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2951 fHitEnd = TRUE;
2952 fRequireEnd = TRUE;
2953 break; // At CR/LF at end of input. Success
2954 }
2955 }
2956
2957 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2958 }
2959 break;
2960
2961
2962 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2963 if (fp->fInputIdx >= fAnchorLimit) {
2964 // Off the end of input. Success.
2965 fHitEnd = TRUE;
2966 fRequireEnd = TRUE;
2967 break;
2968 } else {
2969 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2970 UChar32 c = UTEXT_NEXT32(fInputText);
2971 // Either at the last character of input, or off the end.
2972 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2973 fHitEnd = TRUE;
2974 fRequireEnd = TRUE;
2975 break;
2976 }
2977 }
2978
2979 // Not at end of input. Back-track out.
2980 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2981 break;
2982
2983
2984 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2985 {
2986 if (fp->fInputIdx >= fAnchorLimit) {
2987 // We really are at the end of input. Success.
2988 fHitEnd = TRUE;
2989 fRequireEnd = TRUE;
2990 break;
2991 }
2992 // If we are positioned just before a new-line, succeed.
2993 // It makes no difference where the new-line is within the input.
2994 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2995 UChar32 c = UTEXT_CURRENT32(fInputText);
2996 if (isLineTerminator(c)) {
2997 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2998 // In multi-line mode, hitting a new-line just before the end of input does not
2999 // set the hitEnd or requireEnd flags
3000 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3001 break;
3002 }
3003 }
3004 // not at a new line. Fail.
3005 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3006 }
3007 break;
3008
3009
3010 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3011 {
3012 if (fp->fInputIdx >= fAnchorLimit) {
3013 // We really are at the end of input. Success.
3014 fHitEnd = TRUE;
3015 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3016 break; // adding a new-line would not lose the match.
3017 }
3018 // If we are not positioned just before a new-line, the test fails; backtrack out.
3019 // It makes no difference where the new-line is within the input.
3020 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3021 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3022 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3023 }
3024 }
3025 break;
3026
3027
3028 case URX_CARET: // ^, test for start of line
3029 if (fp->fInputIdx != fAnchorStart) {
3030 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3031 }
3032 break;
3033
3034
3035 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3036 {
3037 if (fp->fInputIdx == fAnchorStart) {
3038 // We are at the start input. Success.
3039 break;
3040 }
3041 // Check whether character just before the current pos is a new-line
3042 // unless we are at the end of input
3043 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3044 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3045 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3046 // It's a new-line. ^ is true. Success.
3047 // TODO: what should be done with positions between a CR and LF?
3048 break;
3049 }
3050 // Not at the start of a line. Fail.
3051 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3052 }
3053 break;
3054
3055
3056 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3057 {
3058 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3059 if (fp->fInputIdx <= fAnchorStart) {
3060 // We are at the start input. Success.
3061 break;
3062 }
3063 // Check whether character just before the current pos is a new-line
3064 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3065 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3066 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3067 if (c != 0x0a) {
3068 // Not at the start of a line. Back-track out.
3069 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3070 }
3071 }
3072 break;
3073
3074 case URX_BACKSLASH_B: // Test for word boundaries
3075 {
3076 UBool success = isWordBoundary(fp->fInputIdx);
3077 success ^= (UBool)(opValue != 0); // flip sense for \B
3078 if (!success) {
3079 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3080 }
3081 }
3082 break;
3083
3084
3085 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3086 {
3087 UBool success = isUWordBoundary(fp->fInputIdx);
3088 success ^= (UBool)(opValue != 0); // flip sense for \B
3089 if (!success) {
3090 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3091 }
3092 }
3093 break;
3094
3095
3096 case URX_BACKSLASH_D: // Test for decimal digit
3097 {
3098 if (fp->fInputIdx >= fActiveLimit) {
3099 fHitEnd = TRUE;
3100 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3101 break;
3102 }
3103
3104 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3105
3106 UChar32 c = UTEXT_NEXT32(fInputText);
3107 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3108 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3109 success ^= (UBool)(opValue != 0); // flip sense for \D
3110 if (success) {
3111 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3112 } else {
3113 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3114 }
3115 }
3116 break;
3117
3118
3119 case URX_BACKSLASH_G: // Test for position at end of previous match
3120 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3121 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3122 }
3123 break;
3124
3125
3126 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3127 {
3128 if (fp->fInputIdx >= fActiveLimit) {
3129 fHitEnd = TRUE;
3130 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3131 break;
3132 }
3133 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3134 UChar32 c = UTEXT_NEXT32(fInputText);
3135 int8_t ctype = u_charType(c);
3136 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3137 success ^= (UBool)(opValue != 0); // flip sense for \H
3138 if (success) {
3139 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3140 } else {
3141 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3142 }
3143 }
3144 break;
3145
3146
3147 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3148 {
3149 if (fp->fInputIdx >= fActiveLimit) {
3150 fHitEnd = TRUE;
3151 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3152 break;
3153 }
3154 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3155 UChar32 c = UTEXT_NEXT32(fInputText);
3156 if (isLineTerminator(c)) {
3157 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3158 utext_next32(fInputText);
3159 }
3160 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3161 } else {
3162 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3163 }
3164 }
3165 break;
3166
3167
3168 case URX_BACKSLASH_V: // \v, any single line ending character.
3169 {
3170 if (fp->fInputIdx >= fActiveLimit) {
3171 fHitEnd = TRUE;
3172 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3173 break;
3174 }
3175 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3176 UChar32 c = UTEXT_NEXT32(fInputText);
3177 UBool success = isLineTerminator(c);
3178 success ^= (UBool)(opValue != 0); // flip sense for \V
3179 if (success) {
3180 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3181 } else {
3182 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3183 }
3184 }
3185 break;
3186
3187
3188 case URX_BACKSLASH_X:
3189 // Match a Grapheme, as defined by Unicode TR 29.
3190 // Differs slightly from Perl, which consumes combining marks independently
3191 // of context.
3192 {
3193
3194 // Fail if at end of input
3195 if (fp->fInputIdx >= fActiveLimit) {
3196 fHitEnd = TRUE;
3197 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3198 break;
3199 }
3200
3201 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3202
3203 // Examine (and consume) the current char.
3204 // Dispatch into a little state machine, based on the char.
3205 UChar32 c;
3206 c = UTEXT_NEXT32(fInputText);
3207 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3208 UnicodeSet **sets = fPattern->fStaticSets;
3209 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3210 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3211 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3212 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3213 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3214 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3215 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3216 goto GC_Extend;
3217
3218
3219
3220 GC_L:
3221 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3222 c = UTEXT_NEXT32(fInputText);
3223 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3224 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3225 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3226 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3227 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3228 (void)UTEXT_PREVIOUS32(fInputText);
3229 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3230 goto GC_Extend;
3231
3232 GC_V:
3233 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3234 c = UTEXT_NEXT32(fInputText);
3235 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3236 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3237 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3238 (void)UTEXT_PREVIOUS32(fInputText);
3239 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3240 goto GC_Extend;
3241
3242 GC_T:
3243 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3244 c = UTEXT_NEXT32(fInputText);
3245 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3246 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3247 (void)UTEXT_PREVIOUS32(fInputText);
3248 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3249 goto GC_Extend;
3250
3251 GC_Extend:
3252 // Combining characters are consumed here
3253 for (;;) {
3254 if (fp->fInputIdx >= fActiveLimit) {
3255 break;
3256 }
3257 c = UTEXT_CURRENT32(fInputText);
3258 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3259 break;
3260 }
3261 (void)UTEXT_NEXT32(fInputText);
3262 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3263 }
3264 goto GC_Done;
3265
3266 GC_Control:
3267 // Most control chars stand alone (don't combine with combining chars),
3268 // except for that CR/LF sequence is a single grapheme cluster.
3269 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3270 c = UTEXT_NEXT32(fInputText);
3271 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3272 }
3273
3274 GC_Done:
3275 if (fp->fInputIdx >= fActiveLimit) {
3276 fHitEnd = TRUE;
3277 }
3278 break;
3279 }
3280
3281
3282
3283
3284 case URX_BACKSLASH_Z: // Test for end of Input
3285 if (fp->fInputIdx < fAnchorLimit) {
3286 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3287 } else {
3288 fHitEnd = TRUE;
3289 fRequireEnd = TRUE;
3290 }
3291 break;
3292
3293
3294
3295 case URX_STATIC_SETREF:
3296 {
3297 // Test input character against one of the predefined sets
3298 // (Word Characters, for example)
3299 // The high bit of the op value is a flag for the match polarity.
3300 // 0: success if input char is in set.
3301 // 1: success if input char is not in set.
3302 if (fp->fInputIdx >= fActiveLimit) {
3303 fHitEnd = TRUE;
3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3305 break;
3306 }
3307
3308 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3309 opValue &= ~URX_NEG_SET;
3310 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3311
3312 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3313 UChar32 c = UTEXT_NEXT32(fInputText);
3314 if (c < 256) {
3315 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3316 if (s8->contains(c)) {
3317 success = !success;
3318 }
3319 } else {
3320 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321 if (s->contains(c)) {
3322 success = !success;
3323 }
3324 }
3325 if (success) {
3326 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3327 } else {
3328 // the character wasn't in the set.
3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3330 }
3331 }
3332 break;
3333
3334
3335 case URX_STAT_SETREF_N:
3336 {
3337 // Test input character for NOT being a member of one of
3338 // the predefined sets (Word Characters, for example)
3339 if (fp->fInputIdx >= fActiveLimit) {
3340 fHitEnd = TRUE;
3341 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3342 break;
3343 }
3344
3345 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3346
3347 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3348
3349 UChar32 c = UTEXT_NEXT32(fInputText);
3350 if (c < 256) {
3351 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3352 if (s8->contains(c) == FALSE) {
3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3354 break;
3355 }
3356 } else {
3357 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3358 if (s->contains(c) == FALSE) {
3359 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3360 break;
3361 }
3362 }
3363 // the character wasn't in the set.
3364 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3365 }
3366 break;
3367
3368
3369 case URX_SETREF:
3370 if (fp->fInputIdx >= fActiveLimit) {
3371 fHitEnd = TRUE;
3372 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3373 break;
3374 } else {
3375 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3376
3377 // There is input left. Pick up one char and test it for set membership.
3378 UChar32 c = UTEXT_NEXT32(fInputText);
3379 U_ASSERT(opValue > 0 && opValue < fSets->size());
3380 if (c<256) {
3381 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3382 if (s8->contains(c)) {
3383 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3384 break;
3385 }
3386 } else {
3387 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3388 if (s->contains(c)) {
3389 // The character is in the set. A Match.
3390 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3391 break;
3392 }
3393 }
3394
3395 // the character wasn't in the set.
3396 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3397 }
3398 break;
3399
3400
3401 case URX_DOTANY:
3402 {
3403 // . matches anything, but stops at end-of-line.
3404 if (fp->fInputIdx >= fActiveLimit) {
3405 // At end of input. Match failed. Backtrack out.
3406 fHitEnd = TRUE;
3407 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3408 break;
3409 }
3410
3411 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3412
3413 // There is input left. Advance over one char, unless we've hit end-of-line
3414 UChar32 c = UTEXT_NEXT32(fInputText);
3415 if (isLineTerminator(c)) {
3416 // End of line in normal mode. . does not match.
3417 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3418 break;
3419 }
3420 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3421 }
3422 break;
3423
3424
3425 case URX_DOTANY_ALL:
3426 {
3427 // ., in dot-matches-all (including new lines) mode
3428 if (fp->fInputIdx >= fActiveLimit) {
3429 // At end of input. Match failed. Backtrack out.
3430 fHitEnd = TRUE;
3431 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3432 break;
3433 }
3434
3435 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3436
3437 // There is input left. Advance over one char, except if we are
3438 // at a cr/lf, advance over both of them.
3439 UChar32 c;
3440 c = UTEXT_NEXT32(fInputText);
3441 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3442 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3443 // In the case of a CR/LF, we need to advance over both.
3444 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3445 if (nextc == 0x0a) {
3446 (void)UTEXT_NEXT32(fInputText);
3447 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3448 }
3449 }
3450 }
3451 break;
3452
3453
3454 case URX_DOTANY_UNIX:
3455 {
3456 // '.' operator, matches all, but stops at end-of-line.
3457 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3458 if (fp->fInputIdx >= fActiveLimit) {
3459 // At end of input. Match failed. Backtrack out.
3460 fHitEnd = TRUE;
3461 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3462 break;
3463 }
3464
3465 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3466
3467 // There is input left. Advance over one char, unless we've hit end-of-line
3468 UChar32 c = UTEXT_NEXT32(fInputText);
3469 if (c == 0x0a) {
3470 // End of line in normal mode. '.' does not match the \n
3471 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3472 } else {
3473 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3474 }
3475 }
3476 break;
3477
3478
3479 case URX_JMP:
3480 fp->fPatIdx = opValue;
3481 break;
3482
3483 case URX_FAIL:
3484 isMatch = FALSE;
3485 goto breakFromLoop;
3486
3487 case URX_JMP_SAV:
3488 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3489 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3490 fp->fPatIdx = opValue; // Then JMP.
3491 break;
3492
3493 case URX_JMP_SAV_X:
3494 // This opcode is used with (x)+, when x can match a zero length string.
3495 // Same as JMP_SAV, except conditional on the match having made forward progress.
3496 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3497 // data address of the input position at the start of the loop.
3498 {
3499 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3500 int32_t stoOp = (int32_t)pat[opValue-1];
3501 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3502 int32_t frameLoc = URX_VAL(stoOp);
3503 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3504 int64_t prevInputIdx = fp->fExtra[frameLoc];
3505 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3506 if (prevInputIdx < fp->fInputIdx) {
3507 // The match did make progress. Repeat the loop.
3508 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3509 fp->fPatIdx = opValue;
3510 fp->fExtra[frameLoc] = fp->fInputIdx;
3511 }
3512 // If the input position did not advance, we do nothing here,
3513 // execution will fall out of the loop.
3514 }
3515 break;
3516
3517 case URX_CTR_INIT:
3518 {
3519 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3520 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3521
3522 // Pick up the three extra operands that CTR_INIT has, and
3523 // skip the pattern location counter past
3524 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3525 fp->fPatIdx += 3;
3526 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3527 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3528 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3529 U_ASSERT(minCount>=0);
3530 U_ASSERT(maxCount>=minCount || maxCount==-1);
3531 U_ASSERT(loopLoc>=fp->fPatIdx);
3532
3533 if (minCount == 0) {
3534 fp = StateSave(fp, loopLoc+1, status);
3535 }
3536 if (maxCount == -1) {
3537 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3538 } else if (maxCount == 0) {
3539 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3540 }
3541 }
3542 break;
3543
3544 case URX_CTR_LOOP:
3545 {
3546 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3547 int32_t initOp = (int32_t)pat[opValue];
3548 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3549 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3550 int32_t minCount = (int32_t)pat[opValue+2];
3551 int32_t maxCount = (int32_t)pat[opValue+3];
3552 (*pCounter)++;
3553 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3554 U_ASSERT(*pCounter == maxCount);
3555 break;
3556 }
3557 if (*pCounter >= minCount) {
3558 if (maxCount == -1) {
3559 // Loop has no hard upper bound.
3560 // Check that it is progressing through the input, break if it is not.
3561 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3562 if (fp->fInputIdx == *pLastInputIdx) {
3563 break;
3564 } else {
3565 *pLastInputIdx = fp->fInputIdx;
3566 }
3567 }
3568 fp = StateSave(fp, fp->fPatIdx, status);
3569 } else {
3570 // Increment time-out counter. (StateSave() does it if count >= minCount)
3571 fTickCounter--;
3572 if (fTickCounter <= 0) {
3573 IncrementTime(status); // Re-initializes fTickCounter
3574 }
3575 }
3576
3577 fp->fPatIdx = opValue + 4; // Loop back.
3578 }
3579 break;
3580
3581 case URX_CTR_INIT_NG:
3582 {
3583 // Initialize a non-greedy loop
3584 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3585 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3586
3587 // Pick up the three extra operands that CTR_INIT_NG has, and
3588 // skip the pattern location counter past
3589 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3590 fp->fPatIdx += 3;
3591 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3592 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3593 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3594 U_ASSERT(minCount>=0);
3595 U_ASSERT(maxCount>=minCount || maxCount==-1);
3596 U_ASSERT(loopLoc>fp->fPatIdx);
3597 if (maxCount == -1) {
3598 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3599 }
3600
3601 if (minCount == 0) {
3602 if (maxCount != 0) {
3603 fp = StateSave(fp, fp->fPatIdx, status);
3604 }
3605 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3606 }
3607 }
3608 break;
3609
3610 case URX_CTR_LOOP_NG:
3611 {
3612 // Non-greedy {min, max} loops
3613 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3614 int32_t initOp = (int32_t)pat[opValue];
3615 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3616 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3617 int32_t minCount = (int32_t)pat[opValue+2];
3618 int32_t maxCount = (int32_t)pat[opValue+3];
3619
3620 (*pCounter)++;
3621 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3622 // The loop has matched the maximum permitted number of times.
3623 // Break out of here with no action. Matching will
3624 // continue with the following pattern.
3625 U_ASSERT(*pCounter == maxCount);
3626 break;
3627 }
3628
3629 if (*pCounter < minCount) {
3630 // We haven't met the minimum number of matches yet.
3631 // Loop back for another one.
3632 fp->fPatIdx = opValue + 4; // Loop back.
3633 // Increment time-out counter. (StateSave() does it if count >= minCount)
3634 fTickCounter--;
3635 if (fTickCounter <= 0) {
3636 IncrementTime(status); // Re-initializes fTickCounter
3637 }
3638 } else {
3639 // We do have the minimum number of matches.
3640
3641 // If there is no upper bound on the loop iterations, check that the input index
3642 // is progressing, and stop the loop if it is not.
3643 if (maxCount == -1) {
3644 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3645 if (fp->fInputIdx == *pLastInputIdx) {
3646 break;
3647 }
3648 *pLastInputIdx = fp->fInputIdx;
3649 }
3650
3651 // Loop Continuation: we will fall into the pattern following the loop
3652 // (non-greedy, don't execute loop body first), but first do
3653 // a state save to the top of the loop, so that a match failure
3654 // in the following pattern will try another iteration of the loop.
3655 fp = StateSave(fp, opValue + 4, status);
3656 }
3657 }
3658 break;
3659
3660 case URX_STO_SP:
3661 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3662 fData[opValue] = fStack->size();
3663 break;
3664
3665 case URX_LD_SP:
3666 {
3667 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3668 int32_t newStackSize = (int32_t)fData[opValue];
3669 U_ASSERT(newStackSize <= fStack->size());
3670 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3671 if (newFP == (int64_t *)fp) {
3672 break;
3673 }
3674 int32_t j;
3675 for (j=0; j<fFrameSize; j++) {
3676 newFP[j] = ((int64_t *)fp)[j];
3677 }
3678 fp = (REStackFrame *)newFP;
3679 fStack->setSize(newStackSize);
3680 }
3681 break;
3682
3683 case URX_BACKREF:
3684 {
3685 U_ASSERT(opValue < fFrameSize);
3686 int64_t groupStartIdx = fp->fExtra[opValue];
3687 int64_t groupEndIdx = fp->fExtra[opValue+1];
3688 U_ASSERT(groupStartIdx <= groupEndIdx);
3689 if (groupStartIdx < 0) {
3690 // This capture group has not participated in the match thus far,
3691 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3692 break;
3693 }
3694 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3695 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3696
3697 // Note: if the capture group match was of an empty string the backref
3698 // match succeeds. Verified by testing: Perl matches succeed
3699 // in this case, so we do too.
3700
3701 UBool success = TRUE;
3702 for (;;) {
3703 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3704 success = TRUE;
3705 break;
3706 }
3707 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3708 success = FALSE;
3709 fHitEnd = TRUE;
3710 break;
3711 }
3712 UChar32 captureGroupChar = utext_next32(fAltInputText);
3713 UChar32 inputChar = utext_next32(fInputText);
3714 if (inputChar != captureGroupChar) {
3715 success = FALSE;
3716 break;
3717 }
3718 }
3719
3720 if (success) {
3721 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3722 } else {
3723 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3724 }
3725 }
3726 break;
3727
3728
3729
3730 case URX_BACKREF_I:
3731 {
3732 U_ASSERT(opValue < fFrameSize);
3733 int64_t groupStartIdx = fp->fExtra[opValue];
3734 int64_t groupEndIdx = fp->fExtra[opValue+1];
3735 U_ASSERT(groupStartIdx <= groupEndIdx);
3736 if (groupStartIdx < 0) {
3737 // This capture group has not participated in the match thus far,
3738 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3739 break;
3740 }
3741 utext_setNativeIndex(fAltInputText, groupStartIdx);
3742 utext_setNativeIndex(fInputText, fp->fInputIdx);
3743 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3744 CaseFoldingUTextIterator inputItr(*fInputText);
3745
3746 // Note: if the capture group match was of an empty string the backref
3747 // match succeeds. Verified by testing: Perl matches succeed
3748 // in this case, so we do too.
3749
3750 UBool success = TRUE;
3751 for (;;) {
3752 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3753 success = TRUE;
3754 break;
3755 }
3756 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3757 success = FALSE;
3758 fHitEnd = TRUE;
3759 break;
3760 }
3761 UChar32 captureGroupChar = captureGroupItr.next();
3762 UChar32 inputChar = inputItr.next();
3763 if (inputChar != captureGroupChar) {
3764 success = FALSE;
3765 break;
3766 }
3767 }
3768
3769 if (success && inputItr.inExpansion()) {
3770 // We otained a match by consuming part of a string obtained from
3771 // case-folding a single code point of the input text.
3772 // This does not count as an overall match.
3773 success = FALSE;
3774 }
3775
3776 if (success) {
3777 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3778 } else {
3779 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3780 }
3781
3782 }
3783 break;
3784
3785 case URX_STO_INP_LOC:
3786 {
3787 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3788 fp->fExtra[opValue] = fp->fInputIdx;
3789 }
3790 break;
3791
3792 case URX_JMPX:
3793 {
3794 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3795 fp->fPatIdx += 1;
3796 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3797 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3798 int64_t savedInputIdx = fp->fExtra[dataLoc];
3799 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3800 if (savedInputIdx < fp->fInputIdx) {
3801 fp->fPatIdx = opValue; // JMP
3802 } else {
3803 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3804 }
3805 }
3806 break;
3807
3808 case URX_LA_START:
3809 {
3810 // Entering a lookahead block.
3811 // Save Stack Ptr, Input Pos.
3812 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3813 fData[opValue] = fStack->size();
3814 fData[opValue+1] = fp->fInputIdx;
3815 fActiveStart = fLookStart; // Set the match region change for
3816 fActiveLimit = fLookLimit; // transparent bounds.
3817 }
3818 break;
3819
3820 case URX_LA_END:
3821 {
3822 // Leaving a look-ahead block.
3823 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3824 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3825 int32_t stackSize = fStack->size();
3826 int32_t newStackSize =(int32_t)fData[opValue];
3827 U_ASSERT(stackSize >= newStackSize);
3828 if (stackSize > newStackSize) {
3829 // Copy the current top frame back to the new (cut back) top frame.
3830 // This makes the capture groups from within the look-ahead
3831 // expression available.
3832 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3833 int32_t j;
3834 for (j=0; j<fFrameSize; j++) {
3835 newFP[j] = ((int64_t *)fp)[j];
3836 }
3837 fp = (REStackFrame *)newFP;
3838 fStack->setSize(newStackSize);
3839 }
3840 fp->fInputIdx = fData[opValue+1];
3841
3842 // Restore the active region bounds in the input string; they may have
3843 // been changed because of transparent bounds on a Region.
3844 fActiveStart = fRegionStart;
3845 fActiveLimit = fRegionLimit;
3846 }
3847 break;
3848
3849 case URX_ONECHAR_I:
3850 // Case insensitive one char. The char from the pattern is already case folded.
3851 // Input text is not, but case folding the input can not reduce two or more code
3852 // points to one.
3853 if (fp->fInputIdx < fActiveLimit) {
3854 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3855
3856 UChar32 c = UTEXT_NEXT32(fInputText);
3857 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3858 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3859 break;
3860 }
3861 } else {
3862 fHitEnd = TRUE;
3863 }
3864
3865 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3866 break;
3867
3868 case URX_STRING_I:
3869 {
3870 // Case-insensitive test input against a literal string.
3871 // Strings require two slots in the compiled pattern, one for the
3872 // offset to the string text, and one for the length.
3873 // The compiled string has already been case folded.
3874 {
3875 const UChar *patternString = litText + opValue;
3876 int32_t patternStringIdx = 0;
3877
3878 op = (int32_t)pat[fp->fPatIdx];
3879 fp->fPatIdx++;
3880 opType = URX_TYPE(op);
3881 opValue = URX_VAL(op);
3882 U_ASSERT(opType == URX_STRING_LEN);
3883 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3884
3885
3886 UChar32 cPattern;
3887 UChar32 cText;
3888 UBool success = TRUE;
3889
3890 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3891 CaseFoldingUTextIterator inputIterator(*fInputText);
3892 while (patternStringIdx < patternStringLen) {
3893 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3894 success = FALSE;
3895 fHitEnd = TRUE;
3896 break;
3897 }
3898 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3899 cText = inputIterator.next();
3900 if (cText != cPattern) {
3901 success = FALSE;
3902 break;
3903 }
3904 }
3905 if (inputIterator.inExpansion()) {
3906 success = FALSE;
3907 }
3908
3909 if (success) {
3910 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3911 } else {
3912 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3913 }
3914 }
3915 }
3916 break;
3917
3918 case URX_LB_START:
3919 {
3920 // Entering a look-behind block.
3921 // Save Stack Ptr, Input Pos.
3922 // TODO: implement transparent bounds. Ticket #6067
3923 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3924 fData[opValue] = fStack->size();
3925 fData[opValue+1] = fp->fInputIdx;
3926 // Init the variable containing the start index for attempted matches.
3927 fData[opValue+2] = -1;
3928 // Save input string length, then reset to pin any matches to end at
3929 // the current position.
3930 fData[opValue+3] = fActiveLimit;
3931 fActiveLimit = fp->fInputIdx;
3932 }
3933 break;
3934
3935
3936 case URX_LB_CONT:
3937 {
3938 // Positive Look-Behind, at top of loop checking for matches of LB expression
3939 // at all possible input starting positions.
3940
3941 // Fetch the min and max possible match lengths. They are the operands
3942 // of this op in the pattern.
3943 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3944 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3945 if (!UTEXT_USES_U16(fInputText)) {
3946 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3947 // The max length need not be exact; it just needs to be >= actual maximum.
3948 maxML *= 3;
3949 }
3950 U_ASSERT(minML <= maxML);
3951 U_ASSERT(minML >= 0);
3952
3953 // Fetch (from data) the last input index where a match was attempted.
3954 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3955 int64_t &lbStartIdx = fData[opValue+2];
3956 if (lbStartIdx < 0) {
3957 // First time through loop.
3958 lbStartIdx = fp->fInputIdx - minML;
3959 if (lbStartIdx > 0) {
3960 // move index to a code point boudary, if it's not on one already.
3961 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3962 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3963 }
3964 } else {
3965 // 2nd through nth time through the loop.
3966 // Back up start position for match by one.
3967 if (lbStartIdx == 0) {
3968 (lbStartIdx)--;
3969 } else {
3970 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3971 (void)UTEXT_PREVIOUS32(fInputText);
3972 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3973 }
3974 }
3975
3976 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3977 // We have tried all potential match starting points without
3978 // getting a match. Backtrack out, and out of the
3979 // Look Behind altogether.
3980 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3981 int64_t restoreInputLen = fData[opValue+3];
3982 U_ASSERT(restoreInputLen >= fActiveLimit);
3983 U_ASSERT(restoreInputLen <= fInputLength);
3984 fActiveLimit = restoreInputLen;
3985 break;
3986 }
3987
3988 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3989 // (successful match will fall off the end of the loop.)
3990 fp = StateSave(fp, fp->fPatIdx-3, status);
3991 fp->fInputIdx = lbStartIdx;
3992 }
3993 break;
3994
3995 case URX_LB_END:
3996 // End of a look-behind block, after a successful match.
3997 {
3998 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3999 if (fp->fInputIdx != fActiveLimit) {
4000 // The look-behind expression matched, but the match did not
4001 // extend all the way to the point that we are looking behind from.
4002 // FAIL out of here, which will take us back to the LB_CONT, which
4003 // will retry the match starting at another position or fail
4004 // the look-behind altogether, whichever is appropriate.
4005 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4006 break;
4007 }
4008
4009 // Look-behind match is good. Restore the orignal input string length,
4010 // which had been truncated to pin the end of the lookbehind match to the
4011 // position being looked-behind.
4012 int64_t originalInputLen = fData[opValue+3];
4013 U_ASSERT(originalInputLen >= fActiveLimit);
4014 U_ASSERT(originalInputLen <= fInputLength);
4015 fActiveLimit = originalInputLen;
4016 }
4017 break;
4018
4019
4020 case URX_LBN_CONT:
4021 {
4022 // Negative Look-Behind, at top of loop checking for matches of LB expression
4023 // at all possible input starting positions.
4024
4025 // Fetch the extra parameters of this op.
4026 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4027 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4028 if (!UTEXT_USES_U16(fInputText)) {
4029 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4030 // The max length need not be exact; it just needs to be >= actual maximum.
4031 maxML *= 3;
4032 }
4033 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4034 continueLoc = URX_VAL(continueLoc);
4035 U_ASSERT(minML <= maxML);
4036 U_ASSERT(minML >= 0);
4037 U_ASSERT(continueLoc > fp->fPatIdx);
4038
4039 // Fetch (from data) the last input index where a match was attempted.
4040 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4041 int64_t &lbStartIdx = fData[opValue+2];
4042 if (lbStartIdx < 0) {
4043 // First time through loop.
4044 lbStartIdx = fp->fInputIdx - minML;
4045 if (lbStartIdx > 0) {
4046 // move index to a code point boudary, if it's not on one already.
4047 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4048 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4049 }
4050 } else {
4051 // 2nd through nth time through the loop.
4052 // Back up start position for match by one.
4053 if (lbStartIdx == 0) {
4054 (lbStartIdx)--;
4055 } else {
4056 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4057 (void)UTEXT_PREVIOUS32(fInputText);
4058 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4059 }
4060 }
4061
4062 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4063 // We have tried all potential match starting points without
4064 // getting a match, which means that the negative lookbehind as
4065 // a whole has succeeded. Jump forward to the continue location
4066 int64_t restoreInputLen = fData[opValue+3];
4067 U_ASSERT(restoreInputLen >= fActiveLimit);
4068 U_ASSERT(restoreInputLen <= fInputLength);
4069 fActiveLimit = restoreInputLen;
4070 fp->fPatIdx = continueLoc;
4071 break;
4072 }
4073
4074 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4075 // (successful match will cause a FAIL out of the loop altogether.)
4076 fp = StateSave(fp, fp->fPatIdx-4, status);
4077 fp->fInputIdx = lbStartIdx;
4078 }
4079 break;
4080
4081 case URX_LBN_END:
4082 // End of a negative look-behind block, after a successful match.
4083 {
4084 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4085 if (fp->fInputIdx != fActiveLimit) {
4086 // The look-behind expression matched, but the match did not
4087 // extend all the way to the point that we are looking behind from.
4088 // FAIL out of here, which will take us back to the LB_CONT, which
4089 // will retry the match starting at another position or succeed
4090 // the look-behind altogether, whichever is appropriate.
4091 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4092 break;
4093 }
4094
4095 // Look-behind expression matched, which means look-behind test as
4096 // a whole Fails
4097
4098 // Restore the orignal input string length, which had been truncated
4099 // inorder to pin the end of the lookbehind match
4100 // to the position being looked-behind.
4101 int64_t originalInputLen = fData[opValue+3];
4102 U_ASSERT(originalInputLen >= fActiveLimit);
4103 U_ASSERT(originalInputLen <= fInputLength);
4104 fActiveLimit = originalInputLen;
4105
4106 // Restore original stack position, discarding any state saved
4107 // by the successful pattern match.
4108 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4109 int32_t newStackSize = (int32_t)fData[opValue];
4110 U_ASSERT(fStack->size() > newStackSize);
4111 fStack->setSize(newStackSize);
4112
4113 // FAIL, which will take control back to someplace
4114 // prior to entering the look-behind test.
4115 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4116 }
4117 break;
4118
4119
4120 case URX_LOOP_SR_I:
4121 // Loop Initialization for the optimized implementation of
4122 // [some character set]*
4123 // This op scans through all matching input.
4124 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4125 {
4126 U_ASSERT(opValue > 0 && opValue < fSets->size());
4127 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4128 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4129
4130 // Loop through input, until either the input is exhausted or
4131 // we reach a character that is not a member of the set.
4132 int64_t ix = fp->fInputIdx;
4133 UTEXT_SETNATIVEINDEX(fInputText, ix);
4134 for (;;) {
4135 if (ix >= fActiveLimit) {
4136 fHitEnd = TRUE;
4137 break;
4138 }
4139 UChar32 c = UTEXT_NEXT32(fInputText);
4140 if (c<256) {
4141 if (s8->contains(c) == FALSE) {
4142 break;
4143 }
4144 } else {
4145 if (s->contains(c) == FALSE) {
4146 break;
4147 }
4148 }
4149 ix = UTEXT_GETNATIVEINDEX(fInputText);
4150 }
4151
4152 // If there were no matching characters, skip over the loop altogether.
4153 // The loop doesn't run at all, a * op always succeeds.
4154 if (ix == fp->fInputIdx) {
4155 fp->fPatIdx++; // skip the URX_LOOP_C op.
4156 break;
4157 }
4158
4159 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4160 // must follow. It's operand is the stack location
4161 // that holds the starting input index for the match of this [set]*
4162 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4163 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4164 int32_t stackLoc = URX_VAL(loopcOp);
4165 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4166 fp->fExtra[stackLoc] = fp->fInputIdx;
4167 fp->fInputIdx = ix;
4168
4169 // Save State to the URX_LOOP_C op that follows this one,
4170 // so that match failures in the following code will return to there.
4171 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4172 fp = StateSave(fp, fp->fPatIdx, status);
4173 fp->fPatIdx++;
4174 }
4175 break;
4176
4177
4178 case URX_LOOP_DOT_I:
4179 // Loop Initialization for the optimized implementation of .*
4180 // This op scans through all remaining input.
4181 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4182 {
4183 // Loop through input until the input is exhausted (we reach an end-of-line)
4184 // In DOTALL mode, we can just go straight to the end of the input.
4185 int64_t ix;
4186 if ((opValue & 1) == 1) {
4187 // Dot-matches-All mode. Jump straight to the end of the string.
4188 ix = fActiveLimit;
4189 fHitEnd = TRUE;
4190 } else {
4191 // NOT DOT ALL mode. Line endings do not match '.'
4192 // Scan forward until a line ending or end of input.
4193 ix = fp->fInputIdx;
4194 UTEXT_SETNATIVEINDEX(fInputText, ix);
4195 for (;;) {
4196 if (ix >= fActiveLimit) {
4197 fHitEnd = TRUE;
4198 break;
4199 }
4200 UChar32 c = UTEXT_NEXT32(fInputText);
4201 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4202 if ((c == 0x0a) || // 0x0a is newline in both modes.
4203 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4204 isLineTerminator(c))) {
4205 // char is a line ending. Exit the scanning loop.
4206 break;
4207 }
4208 }
4209 ix = UTEXT_GETNATIVEINDEX(fInputText);
4210 }
4211 }
4212
4213 // If there were no matching characters, skip over the loop altogether.
4214 // The loop doesn't run at all, a * op always succeeds.
4215 if (ix == fp->fInputIdx) {
4216 fp->fPatIdx++; // skip the URX_LOOP_C op.
4217 break;
4218 }
4219
4220 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4221 // must follow. It's operand is the stack location
4222 // that holds the starting input index for the match of this .*
4223 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4224 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4225 int32_t stackLoc = URX_VAL(loopcOp);
4226 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4227 fp->fExtra[stackLoc] = fp->fInputIdx;
4228 fp->fInputIdx = ix;
4229
4230 // Save State to the URX_LOOP_C op that follows this one,
4231 // so that match failures in the following code will return to there.
4232 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4233 fp = StateSave(fp, fp->fPatIdx, status);
4234 fp->fPatIdx++;
4235 }
4236 break;
4237
4238
4239 case URX_LOOP_C:
4240 {
4241 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4242 backSearchIndex = fp->fExtra[opValue];
4243 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4244 if (backSearchIndex == fp->fInputIdx) {
4245 // We've backed up the input idx to the point that the loop started.
4246 // The loop is done. Leave here without saving state.
4247 // Subsequent failures won't come back here.
4248 break;
4249 }
4250 // Set up for the next iteration of the loop, with input index
4251 // backed up by one from the last time through,
4252 // and a state save to this instruction in case the following code fails again.
4253 // (We're going backwards because this loop emulates stack unwinding, not
4254 // the initial scan forward.)
4255 U_ASSERT(fp->fInputIdx > 0);
4256 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4257 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4258 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4259
4260 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4261 if (prevC == 0x0a &&
4262 fp->fInputIdx > backSearchIndex &&
4263 twoPrevC == 0x0d) {
4264 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4265 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4266 // .*, stepping back over CRLF pair.
4267 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4268 }
4269 }
4270
4271
4272 fp = StateSave(fp, fp->fPatIdx-1, status);
4273 }
4274 break;
4275
4276
4277
4278 default:
4279 // Trouble. The compiled pattern contains an entry with an
4280 // unrecognized type tag.
4281 U_ASSERT(FALSE);
4282 }
4283
4284 if (U_FAILURE(status)) {
4285 isMatch = FALSE;
4286 break;
4287 }
4288 }
4289
4290 breakFromLoop:
4291 fMatch = isMatch;
4292 if (isMatch) {
4293 fLastMatchEnd = fMatchEnd;
4294 fMatchStart = startIdx;
4295 fMatchEnd = fp->fInputIdx;
4296 }
4297
4298 #ifdef REGEX_RUN_DEBUG
4299 if (fTraceDebug) {
4300 if (isMatch) {
4301 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4302 } else {
4303 printf("No match\n\n");
4304 }
4305 }
4306 #endif
4307
4308 fFrame = fp; // The active stack frame when the engine stopped.
4309 // Contains the capture group results that we need to
4310 // access later.
4311 return;
4312 }
4313
4314
4315 //--------------------------------------------------------------------------------
4316 //
4317 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4318 // assumption that the entire string is available in the UText's
4319 // chunk buffer. For now, that means we can use int32_t indexes,
4320 // except for anything that needs to be saved (like group starts
4321 // and ends).
4322 //
4323 // startIdx: begin matching a this index.
4324 // toEnd: if true, match must extend to end of the input region
4325 //
4326 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4327 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4328 UBool isMatch = FALSE; // True if the we have a match.
4329
4330 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4331
4332 int32_t op; // Operation from the compiled pattern, split into
4333 int32_t opType; // the opcode
4334 int32_t opValue; // and the operand value.
4335
4336 #ifdef REGEX_RUN_DEBUG
4337 if (fTraceDebug) {
4338 printf("MatchAt(startIdx=%d)\n", startIdx);
4339 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4340 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4341 }
4342 #endif
4343
4344 if (U_FAILURE(status)) {
4345 return;
4346 }
4347
4348 // Cache frequently referenced items from the compiled pattern
4349 //
4350 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4351
4352 const UChar *litText = fPattern->fLiteralText.getBuffer();
4353 UVector *fSets = fPattern->fSets;
4354
4355 const UChar *inputBuf = fInputText->chunkContents;
4356
4357 fFrameSize = fPattern->fFrameSize;
4358 REStackFrame *fp = resetStack();
4359 if (U_FAILURE(fDeferredStatus)) {
4360 status = fDeferredStatus;
4361 return;
4362 }
4363
4364 fp->fPatIdx = 0;
4365 fp->fInputIdx = startIdx;
4366
4367 // Zero out the pattern's static data
4368 int32_t i;
4369 for (i = 0; i<fPattern->fDataSize; i++) {
4370 fData[i] = 0;
4371 }
4372
4373 //
4374 // Main loop for interpreting the compiled pattern.
4375 // One iteration of the loop per pattern operation performed.
4376 //
4377 for (;;) {
4378 op = (int32_t)pat[fp->fPatIdx];
4379 opType = URX_TYPE(op);
4380 opValue = URX_VAL(op);
4381 #ifdef REGEX_RUN_DEBUG
4382 if (fTraceDebug) {
4383 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4384 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4385 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4386 fPattern->dumpOp(fp->fPatIdx);
4387 }
4388 #endif
4389 fp->fPatIdx++;
4390
4391 switch (opType) {
4392
4393
4394 case URX_NOP:
4395 break;
4396
4397
4398 case URX_BACKTRACK:
4399 // Force a backtrack. In some circumstances, the pattern compiler
4400 // will notice that the pattern can't possibly match anything, and will
4401 // emit one of these at that point.
4402 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4403 break;
4404
4405
4406 case URX_ONECHAR:
4407 if (fp->fInputIdx < fActiveLimit) {
4408 UChar32 c;
4409 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4410 if (c == opValue) {
4411 break;
4412 }
4413 } else {
4414 fHitEnd = TRUE;
4415 }
4416 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4417 break;
4418
4419
4420 case URX_STRING:
4421 {
4422 // Test input against a literal string.
4423 // Strings require two slots in the compiled pattern, one for the
4424 // offset to the string text, and one for the length.
4425 int32_t stringStartIdx = opValue;
4426 int32_t stringLen;
4427
4428 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4429 fp->fPatIdx++;
4430 opType = URX_TYPE(op);
4431 stringLen = URX_VAL(op);
4432 U_ASSERT(opType == URX_STRING_LEN);
4433 U_ASSERT(stringLen >= 2);
4434
4435 const UChar * pInp = inputBuf + fp->fInputIdx;
4436 const UChar * pInpLimit = inputBuf + fActiveLimit;
4437 const UChar * pPat = litText+stringStartIdx;
4438 const UChar * pEnd = pInp + stringLen;
4439 UBool success = TRUE;
4440 while (pInp < pEnd) {
4441 if (pInp >= pInpLimit) {
4442 fHitEnd = TRUE;
4443 success = FALSE;
4444 break;
4445 }
4446 if (*pInp++ != *pPat++) {
4447 success = FALSE;
4448 break;
4449 }
4450 }
4451
4452 if (success) {
4453 fp->fInputIdx += stringLen;
4454 } else {
4455 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4456 }
4457 }
4458 break;
4459
4460
4461 case URX_STATE_SAVE:
4462 fp = StateSave(fp, opValue, status);
4463 break;
4464
4465
4466 case URX_END:
4467 // The match loop will exit via this path on a successful match,
4468 // when we reach the end of the pattern.
4469 if (toEnd && fp->fInputIdx != fActiveLimit) {
4470 // The pattern matched, but not to the end of input. Try some more.
4471 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4472 break;
4473 }
4474 isMatch = TRUE;
4475 goto breakFromLoop;
4476
4477 // Start and End Capture stack frame variables are laid out out like this:
4478 // fp->fExtra[opValue] - The start of a completed capture group
4479 // opValue+1 - The end of a completed capture group
4480 // opValue+2 - the start of a capture group whose end
4481 // has not yet been reached (and might not ever be).
4482 case URX_START_CAPTURE:
4483 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4484 fp->fExtra[opValue+2] = fp->fInputIdx;
4485 break;
4486
4487
4488 case URX_END_CAPTURE:
4489 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4490 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4491 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4492 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4493 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4494 break;
4495
4496
4497 case URX_DOLLAR: // $, test for End of line
4498 // or for position before new line at end of input
4499 if (fp->fInputIdx < fAnchorLimit-2) {
4500 // We are no where near the end of input. Fail.
4501 // This is the common case. Keep it first.
4502 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4503 break;
4504 }
4505 if (fp->fInputIdx >= fAnchorLimit) {
4506 // We really are at the end of input. Success.
4507 fHitEnd = TRUE;
4508 fRequireEnd = TRUE;
4509 break;
4510 }
4511
4512 // If we are positioned just before a new-line that is located at the
4513 // end of input, succeed.
4514 if (fp->fInputIdx == fAnchorLimit-1) {
4515 UChar32 c;
4516 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4517
4518 if (isLineTerminator(c)) {
4519 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4520 // At new-line at end of input. Success
4521 fHitEnd = TRUE;
4522 fRequireEnd = TRUE;
4523 break;
4524 }
4525 }
4526 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4527 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4528 fHitEnd = TRUE;
4529 fRequireEnd = TRUE;
4530 break; // At CR/LF at end of input. Success
4531 }
4532
4533 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4534
4535 break;
4536
4537
4538 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4539 if (fp->fInputIdx >= fAnchorLimit-1) {
4540 // Either at the last character of input, or off the end.
4541 if (fp->fInputIdx == fAnchorLimit-1) {
4542 // At last char of input. Success if it's a new line.
4543 if (inputBuf[fp->fInputIdx] == 0x0a) {
4544 fHitEnd = TRUE;
4545 fRequireEnd = TRUE;
4546 break;
4547 }
4548 } else {
4549 // Off the end of input. Success.
4550 fHitEnd = TRUE;
4551 fRequireEnd = TRUE;
4552 break;
4553 }
4554 }
4555
4556 // Not at end of input. Back-track out.
4557 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4558 break;
4559
4560
4561 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4562 {
4563 if (fp->fInputIdx >= fAnchorLimit) {
4564 // We really are at the end of input. Success.
4565 fHitEnd = TRUE;
4566 fRequireEnd = TRUE;
4567 break;
4568 }
4569 // If we are positioned just before a new-line, succeed.
4570 // It makes no difference where the new-line is within the input.
4571 UChar32 c = inputBuf[fp->fInputIdx];
4572 if (isLineTerminator(c)) {
4573 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4574 // In multi-line mode, hitting a new-line just before the end of input does not
4575 // set the hitEnd or requireEnd flags
4576 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4577 break;
4578 }
4579 }
4580 // not at a new line. Fail.
4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4582 }
4583 break;
4584
4585
4586 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4587 {
4588 if (fp->fInputIdx >= fAnchorLimit) {
4589 // We really are at the end of input. Success.
4590 fHitEnd = TRUE;
4591 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4592 break; // adding a new-line would not lose the match.
4593 }
4594 // If we are not positioned just before a new-line, the test fails; backtrack out.
4595 // It makes no difference where the new-line is within the input.
4596 if (inputBuf[fp->fInputIdx] != 0x0a) {
4597 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4598 }
4599 }
4600 break;
4601
4602
4603 case URX_CARET: // ^, test for start of line
4604 if (fp->fInputIdx != fAnchorStart) {
4605 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4606 }
4607 break;
4608
4609
4610 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4611 {
4612 if (fp->fInputIdx == fAnchorStart) {
4613 // We are at the start input. Success.
4614 break;
4615 }
4616 // Check whether character just before the current pos is a new-line
4617 // unless we are at the end of input
4618 UChar c = inputBuf[fp->fInputIdx - 1];
4619 if ((fp->fInputIdx < fAnchorLimit) &&
4620 isLineTerminator(c)) {
4621 // It's a new-line. ^ is true. Success.
4622 // TODO: what should be done with positions between a CR and LF?
4623 break;
4624 }
4625 // Not at the start of a line. Fail.
4626 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4627 }
4628 break;
4629
4630
4631 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4632 {
4633 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4634 if (fp->fInputIdx <= fAnchorStart) {
4635 // We are at the start input. Success.
4636 break;
4637 }
4638 // Check whether character just before the current pos is a new-line
4639 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4640 UChar c = inputBuf[fp->fInputIdx - 1];
4641 if (c != 0x0a) {
4642 // Not at the start of a line. Back-track out.
4643 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4644 }
4645 }
4646 break;
4647
4648 case URX_BACKSLASH_B: // Test for word boundaries
4649 {
4650 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4651 success ^= (UBool)(opValue != 0); // flip sense for \B
4652 if (!success) {
4653 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4654 }
4655 }
4656 break;
4657
4658
4659 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4660 {
4661 UBool success = isUWordBoundary(fp->fInputIdx);
4662 success ^= (UBool)(opValue != 0); // flip sense for \B
4663 if (!success) {
4664 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4665 }
4666 }
4667 break;
4668
4669
4670 case URX_BACKSLASH_D: // Test for decimal digit
4671 {
4672 if (fp->fInputIdx >= fActiveLimit) {
4673 fHitEnd = TRUE;
4674 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4675 break;
4676 }
4677
4678 UChar32 c;
4679 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4680 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4681 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4682 success ^= (UBool)(opValue != 0); // flip sense for \D
4683 if (!success) {
4684 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4685 }
4686 }
4687 break;
4688
4689
4690 case URX_BACKSLASH_G: // Test for position at end of previous match
4691 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4692 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4693 }
4694 break;
4695
4696
4697 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4698 {
4699 if (fp->fInputIdx >= fActiveLimit) {
4700 fHitEnd = TRUE;
4701 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4702 break;
4703 }
4704 UChar32 c;
4705 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4706 int8_t ctype = u_charType(c);
4707 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4708 success ^= (UBool)(opValue != 0); // flip sense for \H
4709 if (!success) {
4710 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4711 }
4712 }
4713 break;
4714
4715
4716 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4717 {
4718 if (fp->fInputIdx >= fActiveLimit) {
4719 fHitEnd = TRUE;
4720 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4721 break;
4722 }
4723 UChar32 c;
4724 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4725 if (isLineTerminator(c)) {
4726 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4727 // Check for CR/LF sequence. Consume both together when found.
4728 UChar c2;
4729 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4730 if (c2 != 0x0a) {
4731 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4732 }
4733 }
4734 } else {
4735 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4736 }
4737 }
4738 break;
4739
4740
4741 case URX_BACKSLASH_V: // Any single code point line ending.
4742 {
4743 if (fp->fInputIdx >= fActiveLimit) {
4744 fHitEnd = TRUE;
4745 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4746 break;
4747 }
4748 UChar32 c;
4749 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4750 UBool success = isLineTerminator(c);
4751 success ^= (UBool)(opValue != 0); // flip sense for \V
4752 if (!success) {
4753 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4754 }
4755 }
4756 break;
4757
4758
4759
4760 case URX_BACKSLASH_X:
4761 // Match a Grapheme, as defined by Unicode TR 29.
4762 // Differs slightly from Perl, which consumes combining marks independently
4763 // of context.
4764 {
4765
4766 // Fail if at end of input
4767 if (fp->fInputIdx >= fActiveLimit) {
4768 fHitEnd = TRUE;
4769 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4770 break;
4771 }
4772
4773 // Examine (and consume) the current char.
4774 // Dispatch into a little state machine, based on the char.
4775 UChar32 c;
4776 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4777 UnicodeSet **sets = fPattern->fStaticSets;
4778 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4779 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4780 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4781 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4782 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4783 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4784 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4785 goto GC_Extend;
4786
4787
4788
4789 GC_L:
4790 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4791 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4792 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4793 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4794 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4795 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4796 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4797 goto GC_Extend;
4798
4799 GC_V:
4800 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4801 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4802 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4803 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4804 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4805 goto GC_Extend;
4806
4807 GC_T:
4808 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4809 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4810 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4811 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4812 goto GC_Extend;
4813
4814 GC_Extend:
4815 // Combining characters are consumed here
4816 for (;;) {
4817 if (fp->fInputIdx >= fActiveLimit) {
4818 break;
4819 }
4820 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4821 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4822 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4823 break;
4824 }
4825 }
4826 goto GC_Done;
4827
4828 GC_Control:
4829 // Most control chars stand alone (don't combine with combining chars),
4830 // except for that CR/LF sequence is a single grapheme cluster.
4831 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4832 fp->fInputIdx++;
4833 }
4834
4835 GC_Done:
4836 if (fp->fInputIdx >= fActiveLimit) {
4837 fHitEnd = TRUE;
4838 }
4839 break;
4840 }
4841
4842
4843
4844
4845 case URX_BACKSLASH_Z: // Test for end of Input
4846 if (fp->fInputIdx < fAnchorLimit) {
4847 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4848 } else {
4849 fHitEnd = TRUE;
4850 fRequireEnd = TRUE;
4851 }
4852 break;
4853
4854
4855
4856 case URX_STATIC_SETREF:
4857 {
4858 // Test input character against one of the predefined sets
4859 // (Word Characters, for example)
4860 // The high bit of the op value is a flag for the match polarity.
4861 // 0: success if input char is in set.
4862 // 1: success if input char is not in set.
4863 if (fp->fInputIdx >= fActiveLimit) {
4864 fHitEnd = TRUE;
4865 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4866 break;
4867 }
4868
4869 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4870 opValue &= ~URX_NEG_SET;
4871 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4872
4873 UChar32 c;
4874 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4875 if (c < 256) {
4876 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4877 if (s8->contains(c)) {
4878 success = !success;
4879 }
4880 } else {
4881 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4882 if (s->contains(c)) {
4883 success = !success;
4884 }
4885 }
4886 if (!success) {
4887 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888 }
4889 }
4890 break;
4891
4892
4893 case URX_STAT_SETREF_N:
4894 {
4895 // Test input character for NOT being a member of one of
4896 // the predefined sets (Word Characters, for example)
4897 if (fp->fInputIdx >= fActiveLimit) {
4898 fHitEnd = TRUE;
4899 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4900 break;
4901 }
4902
4903 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4904
4905 UChar32 c;
4906 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4907 if (c < 256) {
4908 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4909 if (s8->contains(c) == FALSE) {
4910 break;
4911 }
4912 } else {
4913 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4914 if (s->contains(c) == FALSE) {
4915 break;
4916 }
4917 }
4918 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4919 }
4920 break;
4921
4922
4923 case URX_SETREF:
4924 {
4925 if (fp->fInputIdx >= fActiveLimit) {
4926 fHitEnd = TRUE;
4927 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4928 break;
4929 }
4930
4931 U_ASSERT(opValue > 0 && opValue < fSets->size());
4932
4933 // There is input left. Pick up one char and test it for set membership.
4934 UChar32 c;
4935 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4936 if (c<256) {
4937 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4938 if (s8->contains(c)) {
4939 // The character is in the set. A Match.
4940 break;
4941 }
4942 } else {
4943 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4944 if (s->contains(c)) {
4945 // The character is in the set. A Match.
4946 break;
4947 }
4948 }
4949
4950 // the character wasn't in the set.
4951 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4952 }
4953 break;
4954
4955
4956 case URX_DOTANY:
4957 {
4958 // . matches anything, but stops at end-of-line.
4959 if (fp->fInputIdx >= fActiveLimit) {
4960 // At end of input. Match failed. Backtrack out.
4961 fHitEnd = TRUE;
4962 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4963 break;
4964 }
4965
4966 // There is input left. Advance over one char, unless we've hit end-of-line
4967 UChar32 c;
4968 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4969 if (isLineTerminator(c)) {
4970 // End of line in normal mode. . does not match.
4971 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4972 break;
4973 }
4974 }
4975 break;
4976
4977
4978 case URX_DOTANY_ALL:
4979 {
4980 // . in dot-matches-all (including new lines) mode
4981 if (fp->fInputIdx >= fActiveLimit) {
4982 // At end of input. Match failed. Backtrack out.
4983 fHitEnd = TRUE;
4984 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4985 break;
4986 }
4987
4988 // There is input left. Advance over one char, except if we are
4989 // at a cr/lf, advance over both of them.
4990 UChar32 c;
4991 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4992 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4993 // In the case of a CR/LF, we need to advance over both.
4994 if (inputBuf[fp->fInputIdx] == 0x0a) {
4995 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4996 }
4997 }
4998 }
4999 break;
5000
5001
5002 case URX_DOTANY_UNIX:
5003 {
5004 // '.' operator, matches all, but stops at end-of-line.
5005 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5006 if (fp->fInputIdx >= fActiveLimit) {
5007 // At end of input. Match failed. Backtrack out.
5008 fHitEnd = TRUE;
5009 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5010 break;
5011 }
5012
5013 // There is input left. Advance over one char, unless we've hit end-of-line
5014 UChar32 c;
5015 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5016 if (c == 0x0a) {
5017 // End of line in normal mode. '.' does not match the \n
5018 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5019 }
5020 }
5021 break;
5022
5023
5024 case URX_JMP:
5025 fp->fPatIdx = opValue;
5026 break;
5027
5028 case URX_FAIL:
5029 isMatch = FALSE;
5030 goto breakFromLoop;
5031
5032 case URX_JMP_SAV:
5033 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5034 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5035 fp->fPatIdx = opValue; // Then JMP.
5036 break;
5037
5038 case URX_JMP_SAV_X:
5039 // This opcode is used with (x)+, when x can match a zero length string.
5040 // Same as JMP_SAV, except conditional on the match having made forward progress.
5041 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5042 // data address of the input position at the start of the loop.
5043 {
5044 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5045 int32_t stoOp = (int32_t)pat[opValue-1];
5046 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5047 int32_t frameLoc = URX_VAL(stoOp);
5048 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5049 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5050 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5051 if (prevInputIdx < fp->fInputIdx) {
5052 // The match did make progress. Repeat the loop.
5053 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5054 fp->fPatIdx = opValue;
5055 fp->fExtra[frameLoc] = fp->fInputIdx;
5056 }
5057 // If the input position did not advance, we do nothing here,
5058 // execution will fall out of the loop.
5059 }
5060 break;
5061
5062 case URX_CTR_INIT:
5063 {
5064 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5065 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5066
5067 // Pick up the three extra operands that CTR_INIT has, and
5068 // skip the pattern location counter past
5069 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5070 fp->fPatIdx += 3;
5071 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5072 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5073 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5074 U_ASSERT(minCount>=0);
5075 U_ASSERT(maxCount>=minCount || maxCount==-1);
5076 U_ASSERT(loopLoc>=fp->fPatIdx);
5077
5078 if (minCount == 0) {
5079 fp = StateSave(fp, loopLoc+1, status);
5080 }
5081 if (maxCount == -1) {
5082 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5083 } else if (maxCount == 0) {
5084 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5085 }
5086 }
5087 break;
5088
5089 case URX_CTR_LOOP:
5090 {
5091 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5092 int32_t initOp = (int32_t)pat[opValue];
5093 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5094 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5095 int32_t minCount = (int32_t)pat[opValue+2];
5096 int32_t maxCount = (int32_t)pat[opValue+3];
5097 (*pCounter)++;
5098 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5099 U_ASSERT(*pCounter == maxCount);
5100 break;
5101 }
5102 if (*pCounter >= minCount) {
5103 if (maxCount == -1) {
5104 // Loop has no hard upper bound.
5105 // Check that it is progressing through the input, break if it is not.
5106 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5107 if (fp->fInputIdx == *pLastInputIdx) {
5108 break;
5109 } else {
5110 *pLastInputIdx = fp->fInputIdx;
5111 }
5112 }
5113 fp = StateSave(fp, fp->fPatIdx, status);
5114 } else {
5115 // Increment time-out counter. (StateSave() does it if count >= minCount)
5116 fTickCounter--;
5117 if (fTickCounter <= 0) {
5118 IncrementTime(status); // Re-initializes fTickCounter
5119 }
5120 }
5121 fp->fPatIdx = opValue + 4; // Loop back.
5122 }
5123 break;
5124
5125 case URX_CTR_INIT_NG:
5126 {
5127 // Initialize a non-greedy loop
5128 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5129 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5130
5131 // Pick up the three extra operands that CTR_INIT_NG has, and
5132 // skip the pattern location counter past
5133 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5134 fp->fPatIdx += 3;
5135 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5136 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5137 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5138 U_ASSERT(minCount>=0);
5139 U_ASSERT(maxCount>=minCount || maxCount==-1);
5140 U_ASSERT(loopLoc>fp->fPatIdx);
5141 if (maxCount == -1) {
5142 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5143 }
5144
5145 if (minCount == 0) {
5146 if (maxCount != 0) {
5147 fp = StateSave(fp, fp->fPatIdx, status);
5148 }
5149 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5150 }
5151 }
5152 break;
5153
5154 case URX_CTR_LOOP_NG:
5155 {
5156 // Non-greedy {min, max} loops
5157 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5158 int32_t initOp = (int32_t)pat[opValue];
5159 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5160 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5161 int32_t minCount = (int32_t)pat[opValue+2];
5162 int32_t maxCount = (int32_t)pat[opValue+3];
5163
5164 (*pCounter)++;
5165 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5166 // The loop has matched the maximum permitted number of times.
5167 // Break out of here with no action. Matching will
5168 // continue with the following pattern.
5169 U_ASSERT(*pCounter == maxCount);
5170 break;
5171 }
5172
5173 if (*pCounter < minCount) {
5174 // We haven't met the minimum number of matches yet.
5175 // Loop back for another one.
5176 fp->fPatIdx = opValue + 4; // Loop back.
5177 fTickCounter--;
5178 if (fTickCounter <= 0) {
5179 IncrementTime(status); // Re-initializes fTickCounter
5180 }
5181 } else {
5182 // We do have the minimum number of matches.
5183
5184 // If there is no upper bound on the loop iterations, check that the input index
5185 // is progressing, and stop the loop if it is not.
5186 if (maxCount == -1) {
5187 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5188 if (fp->fInputIdx == *pLastInputIdx) {
5189 break;
5190 }
5191 *pLastInputIdx = fp->fInputIdx;
5192 }
5193
5194 // Loop Continuation: we will fall into the pattern following the loop
5195 // (non-greedy, don't execute loop body first), but first do
5196 // a state save to the top of the loop, so that a match failure
5197 // in the following pattern will try another iteration of the loop.
5198 fp = StateSave(fp, opValue + 4, status);
5199 }
5200 }
5201 break;
5202
5203 case URX_STO_SP:
5204 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5205 fData[opValue] = fStack->size();
5206 break;
5207
5208 case URX_LD_SP:
5209 {
5210 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5211 int32_t newStackSize = (int32_t)fData[opValue];
5212 U_ASSERT(newStackSize <= fStack->size());
5213 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5214 if (newFP == (int64_t *)fp) {
5215 break;
5216 }
5217 int32_t j;
5218 for (j=0; j<fFrameSize; j++) {
5219 newFP[j] = ((int64_t *)fp)[j];
5220 }
5221 fp = (REStackFrame *)newFP;
5222 fStack->setSize(newStackSize);
5223 }
5224 break;
5225
5226 case URX_BACKREF:
5227 {
5228 U_ASSERT(opValue < fFrameSize);
5229 int64_t groupStartIdx = fp->fExtra[opValue];
5230 int64_t groupEndIdx = fp->fExtra[opValue+1];
5231 U_ASSERT(groupStartIdx <= groupEndIdx);
5232 int64_t inputIndex = fp->fInputIdx;
5233 if (groupStartIdx < 0) {
5234 // This capture group has not participated in the match thus far,
5235 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5236 break;
5237 }
5238 UBool success = TRUE;
5239 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5240 if (inputIndex >= fActiveLimit) {
5241 success = FALSE;
5242 fHitEnd = TRUE;
5243 break;
5244 }
5245 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5246 success = FALSE;
5247 break;
5248 }
5249 }
5250 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5251 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5252 // Capture group ended with an unpaired lead surrogate.
5253 // Back reference is not permitted to match lead only of a surrogatge pair.
5254 success = FALSE;
5255 }
5256 if (success) {
5257 fp->fInputIdx = inputIndex;
5258 } else {
5259 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5260 }
5261 }
5262 break;
5263
5264 case URX_BACKREF_I:
5265 {
5266 U_ASSERT(opValue < fFrameSize);
5267 int64_t groupStartIdx = fp->fExtra[opValue];
5268 int64_t groupEndIdx = fp->fExtra[opValue+1];
5269 U_ASSERT(groupStartIdx <= groupEndIdx);
5270 if (groupStartIdx < 0) {
5271 // This capture group has not participated in the match thus far,
5272 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5273 break;
5274 }
5275 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5276 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5277
5278 // Note: if the capture group match was of an empty string the backref
5279 // match succeeds. Verified by testing: Perl matches succeed
5280 // in this case, so we do too.
5281
5282 UBool success = TRUE;
5283 for (;;) {
5284 UChar32 captureGroupChar = captureGroupItr.next();
5285 if (captureGroupChar == U_SENTINEL) {
5286 success = TRUE;
5287 break;
5288 }
5289 UChar32 inputChar = inputItr.next();
5290 if (inputChar == U_SENTINEL) {
5291 success = FALSE;
5292 fHitEnd = TRUE;
5293 break;
5294 }
5295 if (inputChar != captureGroupChar) {
5296 success = FALSE;
5297 break;
5298 }
5299 }
5300
5301 if (success && inputItr.inExpansion()) {
5302 // We otained a match by consuming part of a string obtained from
5303 // case-folding a single code point of the input text.
5304 // This does not count as an overall match.
5305 success = FALSE;
5306 }
5307
5308 if (success) {
5309 fp->fInputIdx = inputItr.getIndex();
5310 } else {
5311 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5312 }
5313 }
5314 break;
5315
5316 case URX_STO_INP_LOC:
5317 {
5318 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5319 fp->fExtra[opValue] = fp->fInputIdx;
5320 }
5321 break;
5322
5323 case URX_JMPX:
5324 {
5325 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5326 fp->fPatIdx += 1;
5327 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5328 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5329 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5330 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5331 if (savedInputIdx < fp->fInputIdx) {
5332 fp->fPatIdx = opValue; // JMP
5333 } else {
5334 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5335 }
5336 }
5337 break;
5338
5339 case URX_LA_START:
5340 {
5341 // Entering a lookahead block.
5342 // Save Stack Ptr, Input Pos.
5343 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5344 fData[opValue] = fStack->size();
5345 fData[opValue+1] = fp->fInputIdx;
5346 fActiveStart = fLookStart; // Set the match region change for
5347 fActiveLimit = fLookLimit; // transparent bounds.
5348 }
5349 break;
5350
5351 case URX_LA_END:
5352 {
5353 // Leaving a look-ahead block.
5354 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5355 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5356 int32_t stackSize = fStack->size();
5357 int32_t newStackSize = (int32_t)fData[opValue];
5358 U_ASSERT(stackSize >= newStackSize);
5359 if (stackSize > newStackSize) {
5360 // Copy the current top frame back to the new (cut back) top frame.
5361 // This makes the capture groups from within the look-ahead
5362 // expression available.
5363 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5364 int32_t j;
5365 for (j=0; j<fFrameSize; j++) {
5366 newFP[j] = ((int64_t *)fp)[j];
5367 }
5368 fp = (REStackFrame *)newFP;
5369 fStack->setSize(newStackSize);
5370 }
5371 fp->fInputIdx = fData[opValue+1];
5372
5373 // Restore the active region bounds in the input string; they may have
5374 // been changed because of transparent bounds on a Region.
5375 fActiveStart = fRegionStart;
5376 fActiveLimit = fRegionLimit;
5377 }
5378 break;
5379
5380 case URX_ONECHAR_I:
5381 if (fp->fInputIdx < fActiveLimit) {
5382 UChar32 c;
5383 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5384 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5385 break;
5386 }
5387 } else {
5388 fHitEnd = TRUE;
5389 }
5390 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5391 break;
5392
5393 case URX_STRING_I:
5394 // Case-insensitive test input against a literal string.
5395 // Strings require two slots in the compiled pattern, one for the
5396 // offset to the string text, and one for the length.
5397 // The compiled string has already been case folded.
5398 {
5399 const UChar *patternString = litText + opValue;
5400
5401 op = (int32_t)pat[fp->fPatIdx];
5402 fp->fPatIdx++;
5403 opType = URX_TYPE(op);
5404 opValue = URX_VAL(op);
5405 U_ASSERT(opType == URX_STRING_LEN);
5406 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5407
5408 UChar32 cText;
5409 UChar32 cPattern;
5410 UBool success = TRUE;
5411 int32_t patternStringIdx = 0;
5412 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5413 while (patternStringIdx < patternStringLen) {
5414 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5415 cText = inputIterator.next();
5416 if (cText != cPattern) {
5417 success = FALSE;
5418 if (cText == U_SENTINEL) {
5419 fHitEnd = TRUE;
5420 }
5421 break;
5422 }
5423 }
5424 if (inputIterator.inExpansion()) {
5425 success = FALSE;
5426 }
5427
5428 if (success) {
5429 fp->fInputIdx = inputIterator.getIndex();
5430 } else {
5431 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5432 }
5433 }
5434 break;
5435
5436 case URX_LB_START:
5437 {
5438 // Entering a look-behind block.
5439 // Save Stack Ptr, Input Pos.
5440 // TODO: implement transparent bounds. Ticket #6067
5441 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5442 fData[opValue] = fStack->size();
5443 fData[opValue+1] = fp->fInputIdx;
5444 // Init the variable containing the start index for attempted matches.
5445 fData[opValue+2] = -1;
5446 // Save input string length, then reset to pin any matches to end at
5447 // the current position.
5448 fData[opValue+3] = fActiveLimit;
5449 fActiveLimit = fp->fInputIdx;
5450 }
5451 break;
5452
5453
5454 case URX_LB_CONT:
5455 {
5456 // Positive Look-Behind, at top of loop checking for matches of LB expression
5457 // at all possible input starting positions.
5458
5459 // Fetch the min and max possible match lengths. They are the operands
5460 // of this op in the pattern.
5461 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5462 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5463 U_ASSERT(minML <= maxML);
5464 U_ASSERT(minML >= 0);
5465
5466 // Fetch (from data) the last input index where a match was attempted.
5467 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5468 int64_t &lbStartIdx = fData[opValue+2];
5469 if (lbStartIdx < 0) {
5470 // First time through loop.
5471 lbStartIdx = fp->fInputIdx - minML;
5472 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5473 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5474 }
5475 } else {
5476 // 2nd through nth time through the loop.
5477 // Back up start position for match by one.
5478 if (lbStartIdx == 0) {
5479 lbStartIdx--;
5480 } else {
5481 U16_BACK_1(inputBuf, 0, lbStartIdx);
5482 }
5483 }
5484
5485 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5486 // We have tried all potential match starting points without
5487 // getting a match. Backtrack out, and out of the
5488 // Look Behind altogether.
5489 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5490 int64_t restoreInputLen = fData[opValue+3];
5491 U_ASSERT(restoreInputLen >= fActiveLimit);
5492 U_ASSERT(restoreInputLen <= fInputLength);
5493 fActiveLimit = restoreInputLen;
5494 break;
5495 }
5496
5497 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5498 // (successful match will fall off the end of the loop.)
5499 fp = StateSave(fp, fp->fPatIdx-3, status);
5500 fp->fInputIdx = lbStartIdx;
5501 }
5502 break;
5503
5504 case URX_LB_END:
5505 // End of a look-behind block, after a successful match.
5506 {
5507 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5508 if (fp->fInputIdx != fActiveLimit) {
5509 // The look-behind expression matched, but the match did not
5510 // extend all the way to the point that we are looking behind from.
5511 // FAIL out of here, which will take us back to the LB_CONT, which
5512 // will retry the match starting at another position or fail
5513 // the look-behind altogether, whichever is appropriate.
5514 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5515 break;
5516 }
5517
5518 // Look-behind match is good. Restore the orignal input string length,
5519 // which had been truncated to pin the end of the lookbehind match to the
5520 // position being looked-behind.
5521 int64_t originalInputLen = fData[opValue+3];
5522 U_ASSERT(originalInputLen >= fActiveLimit);
5523 U_ASSERT(originalInputLen <= fInputLength);
5524 fActiveLimit = originalInputLen;
5525 }
5526 break;
5527
5528
5529 case URX_LBN_CONT:
5530 {
5531 // Negative Look-Behind, at top of loop checking for matches of LB expression
5532 // at all possible input starting positions.
5533
5534 // Fetch the extra parameters of this op.
5535 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5536 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5537 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5538 continueLoc = URX_VAL(continueLoc);
5539 U_ASSERT(minML <= maxML);
5540 U_ASSERT(minML >= 0);
5541 U_ASSERT(continueLoc > fp->fPatIdx);
5542
5543 // Fetch (from data) the last input index where a match was attempted.
5544 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5545 int64_t &lbStartIdx = fData[opValue+2];
5546 if (lbStartIdx < 0) {
5547 // First time through loop.
5548 lbStartIdx = fp->fInputIdx - minML;
5549 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5550 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5551 }
5552 } else {
5553 // 2nd through nth time through the loop.
5554 // Back up start position for match by one.
5555 if (lbStartIdx == 0) {
5556 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
5557 } else {
5558 U16_BACK_1(inputBuf, 0, lbStartIdx);
5559 }
5560 }
5561
5562 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5563 // We have tried all potential match starting points without
5564 // getting a match, which means that the negative lookbehind as
5565 // a whole has succeeded. Jump forward to the continue location
5566 int64_t restoreInputLen = fData[opValue+3];
5567 U_ASSERT(restoreInputLen >= fActiveLimit);
5568 U_ASSERT(restoreInputLen <= fInputLength);
5569 fActiveLimit = restoreInputLen;
5570 fp->fPatIdx = continueLoc;
5571 break;
5572 }
5573
5574 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5575 // (successful match will cause a FAIL out of the loop altogether.)
5576 fp = StateSave(fp, fp->fPatIdx-4, status);
5577 fp->fInputIdx = lbStartIdx;
5578 }
5579 break;
5580
5581 case URX_LBN_END:
5582 // End of a negative look-behind block, after a successful match.
5583 {
5584 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5585 if (fp->fInputIdx != fActiveLimit) {
5586 // The look-behind expression matched, but the match did not
5587 // extend all the way to the point that we are looking behind from.
5588 // FAIL out of here, which will take us back to the LB_CONT, which
5589 // will retry the match starting at another position or succeed
5590 // the look-behind altogether, whichever is appropriate.
5591 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5592 break;
5593 }
5594
5595 // Look-behind expression matched, which means look-behind test as
5596 // a whole Fails
5597
5598 // Restore the orignal input string length, which had been truncated
5599 // inorder to pin the end of the lookbehind match
5600 // to the position being looked-behind.
5601 int64_t originalInputLen = fData[opValue+3];
5602 U_ASSERT(originalInputLen >= fActiveLimit);
5603 U_ASSERT(originalInputLen <= fInputLength);
5604 fActiveLimit = originalInputLen;
5605
5606 // Restore original stack position, discarding any state saved
5607 // by the successful pattern match.
5608 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5609 int32_t newStackSize = (int32_t)fData[opValue];
5610 U_ASSERT(fStack->size() > newStackSize);
5611 fStack->setSize(newStackSize);
5612
5613 // FAIL, which will take control back to someplace
5614 // prior to entering the look-behind test.
5615 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5616 }
5617 break;
5618
5619
5620 case URX_LOOP_SR_I:
5621 // Loop Initialization for the optimized implementation of
5622 // [some character set]*
5623 // This op scans through all matching input.
5624 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5625 {
5626 U_ASSERT(opValue > 0 && opValue < fSets->size());
5627 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5628 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
5629
5630 // Loop through input, until either the input is exhausted or
5631 // we reach a character that is not a member of the set.
5632 int32_t ix = (int32_t)fp->fInputIdx;
5633 for (;;) {
5634 if (ix >= fActiveLimit) {
5635 fHitEnd = TRUE;
5636 break;
5637 }
5638 UChar32 c;
5639 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5640 if (c<256) {
5641 if (s8->contains(c) == FALSE) {
5642 U16_BACK_1(inputBuf, 0, ix);
5643 break;
5644 }
5645 } else {
5646 if (s->contains(c) == FALSE) {
5647 U16_BACK_1(inputBuf, 0, ix);
5648 break;
5649 }
5650 }
5651 }
5652
5653 // If there were no matching characters, skip over the loop altogether.
5654 // The loop doesn't run at all, a * op always succeeds.
5655 if (ix == fp->fInputIdx) {
5656 fp->fPatIdx++; // skip the URX_LOOP_C op.
5657 break;
5658 }
5659
5660 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5661 // must follow. It's operand is the stack location
5662 // that holds the starting input index for the match of this [set]*
5663 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5664 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5665 int32_t stackLoc = URX_VAL(loopcOp);
5666 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5667 fp->fExtra[stackLoc] = fp->fInputIdx;
5668 fp->fInputIdx = ix;
5669
5670 // Save State to the URX_LOOP_C op that follows this one,
5671 // so that match failures in the following code will return to there.
5672 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5673 fp = StateSave(fp, fp->fPatIdx, status);
5674 fp->fPatIdx++;
5675 }
5676 break;
5677
5678
5679 case URX_LOOP_DOT_I:
5680 // Loop Initialization for the optimized implementation of .*
5681 // This op scans through all remaining input.
5682 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5683 {
5684 // Loop through input until the input is exhausted (we reach an end-of-line)
5685 // In DOTALL mode, we can just go straight to the end of the input.
5686 int32_t ix;
5687 if ((opValue & 1) == 1) {
5688 // Dot-matches-All mode. Jump straight to the end of the string.
5689 ix = (int32_t)fActiveLimit;
5690 fHitEnd = TRUE;
5691 } else {
5692 // NOT DOT ALL mode. Line endings do not match '.'
5693 // Scan forward until a line ending or end of input.
5694 ix = (int32_t)fp->fInputIdx;
5695 for (;;) {
5696 if (ix >= fActiveLimit) {
5697 fHitEnd = TRUE;
5698 break;
5699 }
5700 UChar32 c;
5701 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5702 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5703 if ((c == 0x0a) || // 0x0a is newline in both modes.
5704 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5705 isLineTerminator(c))) {
5706 // char is a line ending. Put the input pos back to the
5707 // line ending char, and exit the scanning loop.
5708 U16_BACK_1(inputBuf, 0, ix);
5709 break;
5710 }
5711 }
5712 }
5713 }
5714
5715 // If there were no matching characters, skip over the loop altogether.
5716 // The loop doesn't run at all, a * op always succeeds.
5717 if (ix == fp->fInputIdx) {
5718 fp->fPatIdx++; // skip the URX_LOOP_C op.
5719 break;
5720 }
5721
5722 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5723 // must follow. It's operand is the stack location
5724 // that holds the starting input index for the match of this .*
5725 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5726 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5727 int32_t stackLoc = URX_VAL(loopcOp);
5728 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5729 fp->fExtra[stackLoc] = fp->fInputIdx;
5730 fp->fInputIdx = ix;
5731
5732 // Save State to the URX_LOOP_C op that follows this one,
5733 // so that match failures in the following code will return to there.
5734 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5735 fp = StateSave(fp, fp->fPatIdx, status);
5736 fp->fPatIdx++;
5737 }
5738 break;
5739
5740
5741 case URX_LOOP_C:
5742 {
5743 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5744 backSearchIndex = (int32_t)fp->fExtra[opValue];
5745 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5746 if (backSearchIndex == fp->fInputIdx) {
5747 // We've backed up the input idx to the point that the loop started.
5748 // The loop is done. Leave here without saving state.
5749 // Subsequent failures won't come back here.
5750 break;
5751 }
5752 // Set up for the next iteration of the loop, with input index
5753 // backed up by one from the last time through,
5754 // and a state save to this instruction in case the following code fails again.
5755 // (We're going backwards because this loop emulates stack unwinding, not
5756 // the initial scan forward.)
5757 U_ASSERT(fp->fInputIdx > 0);
5758 UChar32 prevC;
5759 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5760
5761 if (prevC == 0x0a &&
5762 fp->fInputIdx > backSearchIndex &&
5763 inputBuf[fp->fInputIdx-1] == 0x0d) {
5764 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5765 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5766 // .*, stepping back over CRLF pair.
5767 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5768 }
5769 }
5770
5771
5772 fp = StateSave(fp, fp->fPatIdx-1, status);
5773 }
5774 break;
5775
5776
5777
5778 default:
5779 // Trouble. The compiled pattern contains an entry with an
5780 // unrecognized type tag.
5781 U_ASSERT(FALSE);
5782 }
5783
5784 if (U_FAILURE(status)) {
5785 isMatch = FALSE;
5786 break;
5787 }
5788 }
5789
5790 breakFromLoop:
5791 fMatch = isMatch;
5792 if (isMatch) {
5793 fLastMatchEnd = fMatchEnd;
5794 fMatchStart = startIdx;
5795 fMatchEnd = fp->fInputIdx;
5796 }
5797
5798 #ifdef REGEX_RUN_DEBUG
5799 if (fTraceDebug) {
5800 if (isMatch) {
5801 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5802 } else {
5803 printf("No match\n\n");
5804 }
5805 }
5806 #endif
5807
5808 fFrame = fp; // The active stack frame when the engine stopped.
5809 // Contains the capture group results that we need to
5810 // access later.
5811
5812 return;
5813 }
5814
5815
5816 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5817
5818 U_NAMESPACE_END
5819
5820 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5821
5822