1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 // file: rematch.cpp
11 //
12 // Contains the implementation of class RegexMatcher,
13 // which is one of the main API classes for the ICU regular expression package.
14 //
15
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36
37 // #include <malloc.h> // Needed for heapcheck testing
38
39
40 U_NAMESPACE_BEGIN
41
42 // Default limit for the size of the back track stack, to avoid system
43 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 // which use recursion rather than the heap, and take more storage per
46 // backtrack point.
47 //
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50 // Time limit counter constant.
51 // Time limits for expression evaluation are in terms of quanta of work by
52 // the engine, each of which is 10,000 state saves.
53 // This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)58 static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64
65 //-----------------------------------------------------------------------------
66 //
67 // Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)70 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
76 if (pat==NULL) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
80 fPattern = pat;
81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83
84
85
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)86 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
88 init(status);
89 if (U_FAILURE(status)) {
90 return;
91 }
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
94 fPattern = fPatternOwned;
95
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
100
101 fInputUniStrMaybeMutable = TRUE;
102 }
103
104
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
117 fPattern = fPatternOwned;
118 init2(input, status);
119 }
120
121
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)122 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
123 uint32_t flags, UErrorCode &status) {
124 init(status);
125 if (U_FAILURE(status)) {
126 return;
127 }
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)137 RegexMatcher::RegexMatcher(UText *regexp,
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
149 fPattern = fPatternOwned;
150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152
153
154
155
~RegexMatcher()156 RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
159 uprv_free(fData);
160 fData = NULL;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = NULL;
165 fPattern = NULL;
166 }
167
168 if (fInput) {
169 delete fInput;
170 }
171 if (fInputText) {
172 utext_close(fInputText);
173 }
174 if (fAltInputText) {
175 utext_close(fAltInputText);
176 }
177
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr;
180 #endif
181 }
182
183 //
184 // init() common initialization for use by all constructors.
185 // Initialize all fields, get the object into a consistent state.
186 // This must be done even when the initial status shows an error,
187 // so that the object is initialized sufficiently well for the destructor
188 // to run safely.
189 //
init(UErrorCode & status)190 void RegexMatcher::init(UErrorCode &status) {
191 fPattern = NULL;
192 fPatternOwned = NULL;
193 fFrameSize = 0;
194 fRegionStart = 0;
195 fRegionLimit = 0;
196 fAnchorStart = 0;
197 fAnchorLimit = 0;
198 fLookStart = 0;
199 fLookLimit = 0;
200 fActiveStart = 0;
201 fActiveLimit = 0;
202 fTransparentBounds = FALSE;
203 fAnchoringBounds = TRUE;
204 fMatch = FALSE;
205 fMatchStart = 0;
206 fMatchEnd = 0;
207 fLastMatchEnd = -1;
208 fAppendPosition = 0;
209 fHitEnd = FALSE;
210 fRequireEnd = FALSE;
211 fStack = NULL;
212 fFrame = NULL;
213 fTimeLimit = 0;
214 fTime = 0;
215 fTickCounter = 0;
216 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
217 fCallbackFn = NULL;
218 fCallbackContext = NULL;
219 fFindProgressCallbackFn = NULL;
220 fFindProgressCallbackContext = NULL;
221 fTraceDebug = FALSE;
222 fDeferredStatus = status;
223 fData = fSmallData;
224 fWordBreakItr = NULL;
225
226 fStack = NULL;
227 fInputText = NULL;
228 fAltInputText = NULL;
229 fInput = NULL;
230 fInputLength = 0;
231 fInputUniStrMaybeMutable = FALSE;
232 }
233
234 //
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
237 //
init2(UText * input,UErrorCode & status)238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
239 if (U_FAILURE(status)) {
240 fDeferredStatus = status;
241 return;
242 }
243
244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246 if (fData == NULL) {
247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248 return;
249 }
250 }
251
252 fStack = new UVector64(status);
253 if (fStack == NULL) {
254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255 return;
256 }
257
258 reset(input);
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260 if (U_FAILURE(status)) {
261 fDeferredStatus = status;
262 return;
263 }
264 }
265
266
267 static const UChar BACKSLASH = 0x5c;
268 static const UChar DOLLARSIGN = 0x24;
269 static const UChar LEFTBRACKET = 0x7b;
270 static const UChar RIGHTBRACKET = 0x7d;
271
272 //--------------------------------------------------------------------------------
273 //
274 // appendReplacement
275 //
276 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278 const UnicodeString &replacement,
279 UErrorCode &status) {
280 UText replacementText = UTEXT_INITIALIZER;
281
282 utext_openConstUnicodeString(&replacementText, &replacement, &status);
283 if (U_SUCCESS(status)) {
284 UText resultText = UTEXT_INITIALIZER;
285 utext_openUnicodeString(&resultText, &dest, &status);
286
287 if (U_SUCCESS(status)) {
288 appendReplacement(&resultText, &replacementText, status);
289 utext_close(&resultText);
290 }
291 utext_close(&replacementText);
292 }
293
294 return *this;
295 }
296
297 //
298 // appendReplacement, UText mode
299 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301 UText *replacement,
302 UErrorCode &status) {
303 if (U_FAILURE(status)) {
304 return *this;
305 }
306 if (U_FAILURE(fDeferredStatus)) {
307 status = fDeferredStatus;
308 return *this;
309 }
310 if (fMatch == FALSE) {
311 status = U_REGEX_INVALID_STATE;
312 return *this;
313 }
314
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen = utext_nativeLength(dest);
317 if (fMatchStart > fAppendPosition) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320 (int32_t)(fMatchStart-fAppendPosition), &status);
321 } else {
322 int32_t len16;
323 if (UTEXT_USES_U16(fInputText)) {
324 len16 = (int32_t)(fMatchStart-fAppendPosition);
325 } else {
326 UErrorCode lengthStatus = U_ZERO_ERROR;
327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328 }
329 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330 if (inputChars == NULL) {
331 status = U_MEMORY_ALLOCATION_ERROR;
332 return *this;
333 }
334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336 uprv_free(inputChars);
337 }
338 }
339 fAppendPosition = fMatchEnd;
340
341
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement, 0);
346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
347 if (c == BACKSLASH) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
352 // loop iteration.
353 c = UTEXT_CURRENT32(replacement);
354 if (c == U_SENTINEL) {
355 break;
356 }
357
358 if (c==0x55/*U*/ || c==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
360 int32_t offset = 0;
361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363 if (escapedChar != (UChar32)0xFFFFFFFF) {
364 if (U_IS_BMP(escapedChar)) {
365 UChar c16 = (UChar)escapedChar;
366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367 } else {
368 UChar surrogate[2];
369 surrogate[0] = U16_LEAD(escapedChar);
370 surrogate[1] = U16_TRAIL(escapedChar);
371 if (U_SUCCESS(status)) {
372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373 }
374 }
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
377 if (context.lastOffset == offset) {
378 (void)UTEXT_PREVIOUS32(replacement);
379 } else if (context.lastOffset != offset-1) {
380 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381 }
382 }
383 } else {
384 (void)UTEXT_NEXT32(replacement);
385 // Plain backslash escape. Just put out the escaped character.
386 if (U_IS_BMP(c)) {
387 UChar c16 = (UChar)c;
388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389 } else {
390 UChar surrogate[2];
391 surrogate[0] = U16_LEAD(c);
392 surrogate[1] = U16_TRAIL(c);
393 if (U_SUCCESS(status)) {
394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395 }
396 }
397 }
398 } else if (c != DOLLARSIGN) {
399 // Normal char, not a $. Copy it out without further checks.
400 if (U_IS_BMP(c)) {
401 UChar c16 = (UChar)c;
402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403 } else {
404 UChar surrogate[2];
405 surrogate[0] = U16_LEAD(c);
406 surrogate[1] = U16_TRAIL(c);
407 if (U_SUCCESS(status)) {
408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409 }
410 }
411 } else {
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
415
416 int32_t groupNum = 0;
417 int32_t numDigits = 0;
418 UChar32 nextChar = utext_current32(replacement);
419 if (nextChar == LEFTBRACKET) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName;
422 utext_next32(replacement);
423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424 nextChar = utext_next32(replacement);
425 if (nextChar == U_SENTINEL) {
426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
430 groupName.append(nextChar);
431 } else if (nextChar == RIGHTBRACKET) {
432 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
433 if (groupNum == 0) {
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
436 } else {
437 // Character was something other than a name char or a closing '}'
438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439 }
440 }
441
442 } else if (u_isdigit(nextChar)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445 for (;;) {
446 nextChar = UTEXT_CURRENT32(replacement);
447 if (nextChar == U_SENTINEL) {
448 break;
449 }
450 if (u_isdigit(nextChar) == FALSE) {
451 break;
452 }
453 int32_t nextDigitVal = u_charDigitValue(nextChar);
454 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits == 0) {
457 status = U_INDEX_OUTOFBOUNDS_ERROR;
458 }
459 break;
460 }
461 (void)UTEXT_NEXT32(replacement);
462 groupNum=groupNum*10 + nextDigitVal;
463 ++numDigits;
464 }
465 } else {
466 // $ not followed by capture group name or number.
467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468 }
469
470 if (U_SUCCESS(status)) {
471 destLen += appendGroup(groupNum, dest, status);
472 }
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
475
476 return *this;
477 }
478
479
480
481 //--------------------------------------------------------------------------------
482 //
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
486 //
487 // Note: Match ranges do not affect appendTail or appendReplacement
488 //
489 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491 UErrorCode status = U_ZERO_ERROR;
492 UText resultText = UTEXT_INITIALIZER;
493 utext_openUnicodeString(&resultText, &dest, &status);
494
495 if (U_SUCCESS(status)) {
496 appendTail(&resultText, status);
497 utext_close(&resultText);
498 }
499
500 return dest;
501 }
502
503 //
504 // appendTail, UText mode
505 //
appendTail(UText * dest,UErrorCode & status)506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507 if (U_FAILURE(status)) {
508 return dest;
509 }
510 if (U_FAILURE(fDeferredStatus)) {
511 status = fDeferredStatus;
512 return dest;
513 }
514
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
528
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
539 }
540 return dest;
541 }
542
543
544
545 //--------------------------------------------------------------------------------
546 //
547 // end
548 //
549 //--------------------------------------------------------------------------------
end(UErrorCode & err) const550 int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552 }
553
end64(UErrorCode & err) const554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556 }
557
end64(int32_t group,UErrorCode & err) const558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
570 int64_t e = -1;
571 if (group == 0) {
572 e = fMatchEnd;
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
581
582 return e;
583 }
584
end(int32_t group,UErrorCode & err) const585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587 }
588
589 //--------------------------------------------------------------------------------
590 //
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
594 //
595 // Return: TRUE if the find operation is to be terminated.
596 // FALSE if the find operation is to continue running.
597 //
598 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601 status = U_REGEX_STOPPED_BY_CALLER;
602 return TRUE;
603 }
604 return FALSE;
605 }
606
607 //--------------------------------------------------------------------------------
608 //
609 // find()
610 //
611 //--------------------------------------------------------------------------------
find()612 UBool RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus)) {
614 return FALSE;
615 }
616 UErrorCode status = U_ZERO_ERROR;
617 UBool result = find(status);
618 return result;
619 }
620
621 //--------------------------------------------------------------------------------
622 //
623 // find()
624 //
625 //--------------------------------------------------------------------------------
find(UErrorCode & status)626 UBool RegexMatcher::find(UErrorCode &status) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
629 //
630 if (U_FAILURE(status)) {
631 return FALSE;
632 }
633 if (U_FAILURE(fDeferredStatus)) {
634 status = fDeferredStatus;
635 return FALSE;
636 }
637
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639 return findUsingChunk(status);
640 }
641
642 int64_t startPos = fMatchEnd;
643 if (startPos==0) {
644 startPos = fActiveStart;
645 }
646
647 if (fMatch) {
648 // Save the position of any previous successful match.
649 fLastMatchEnd = fMatchEnd;
650
651 if (fMatchStart == fMatchEnd) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos >= fActiveLimit) {
655 fMatch = FALSE;
656 fHitEnd = TRUE;
657 return FALSE;
658 }
659 UTEXT_SETNATIVEINDEX(fInputText, startPos);
660 (void)UTEXT_NEXT32(fInputText);
661 startPos = UTEXT_GETNATIVEINDEX(fInputText);
662 }
663 } else {
664 if (fLastMatchEnd >= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
668 fHitEnd = TRUE;
669 return FALSE;
670 }
671 }
672
673
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit;
679 if (UTEXT_USES_U16(fInputText)) {
680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681 if (startPos > testStartLimit) {
682 fMatch = FALSE;
683 fHitEnd = TRUE;
684 return FALSE;
685 }
686 } else {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690 }
691
692 UChar32 c;
693 U_ASSERT(startPos >= 0);
694
695 switch (fPattern->fStartType) {
696 case START_NO_INFO:
697 // No optimization was found.
698 // Try a match at each input position.
699 for (;;) {
700 MatchAt(startPos, FALSE, status);
701 if (U_FAILURE(status)) {
702 return FALSE;
703 }
704 if (fMatch) {
705 return TRUE;
706 }
707 if (startPos >= testStartLimit) {
708 fHitEnd = TRUE;
709 return FALSE;
710 }
711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
712 (void)UTEXT_NEXT32(fInputText);
713 startPos = UTEXT_GETNATIVEINDEX(fInputText);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos, status))
718 return FALSE;
719 }
720 U_ASSERT(FALSE);
721
722 case START_START:
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos > fActiveStart) {
726 fMatch = FALSE;
727 return FALSE;
728 }
729 MatchAt(startPos, FALSE, status);
730 if (U_FAILURE(status)) {
731 return FALSE;
732 }
733 return fMatch;
734
735
736 case START_SET:
737 {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern->fMinMatchLen > 0);
740 UTEXT_SETNATIVEINDEX(fInputText, startPos);
741 for (;;) {
742 int64_t pos = startPos;
743 c = UTEXT_NEXT32(fInputText);
744 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749 (c>=256 && fPattern->fInitialChars->contains(c)))) {
750 MatchAt(pos, FALSE, status);
751 if (U_FAILURE(status)) {
752 return FALSE;
753 }
754 if (fMatch) {
755 return TRUE;
756 }
757 UTEXT_SETNATIVEINDEX(fInputText, pos);
758 }
759 if (startPos > testStartLimit) {
760 fMatch = FALSE;
761 fHitEnd = TRUE;
762 return FALSE;
763 }
764 if (findProgressInterrupt(startPos, status))
765 return FALSE;
766 }
767 }
768 U_ASSERT(FALSE);
769
770 case START_STRING:
771 case START_CHAR:
772 {
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern->fMinMatchLen > 0);
775 UChar32 theChar = fPattern->fInitialChar;
776 UTEXT_SETNATIVEINDEX(fInputText, startPos);
777 for (;;) {
778 int64_t pos = startPos;
779 c = UTEXT_NEXT32(fInputText);
780 startPos = UTEXT_GETNATIVEINDEX(fInputText);
781 if (c == theChar) {
782 MatchAt(pos, FALSE, status);
783 if (U_FAILURE(status)) {
784 return FALSE;
785 }
786 if (fMatch) {
787 return TRUE;
788 }
789 UTEXT_SETNATIVEINDEX(fInputText, startPos);
790 }
791 if (startPos > testStartLimit) {
792 fMatch = FALSE;
793 fHitEnd = TRUE;
794 return FALSE;
795 }
796 if (findProgressInterrupt(startPos, status))
797 return FALSE;
798 }
799 }
800 U_ASSERT(FALSE);
801
802 case START_LINE:
803 {
804 UChar32 c;
805 if (startPos == fAnchorStart) {
806 MatchAt(startPos, FALSE, status);
807 if (U_FAILURE(status)) {
808 return FALSE;
809 }
810 if (fMatch) {
811 return TRUE;
812 }
813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814 c = UTEXT_NEXT32(fInputText);
815 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816 } else {
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818 c = UTEXT_PREVIOUS32(fInputText);
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820 }
821
822 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823 for (;;) {
824 if (c == 0x0a) {
825 MatchAt(startPos, FALSE, status);
826 if (U_FAILURE(status)) {
827 return FALSE;
828 }
829 if (fMatch) {
830 return TRUE;
831 }
832 UTEXT_SETNATIVEINDEX(fInputText, startPos);
833 }
834 if (startPos >= testStartLimit) {
835 fMatch = FALSE;
836 fHitEnd = TRUE;
837 return FALSE;
838 }
839 c = UTEXT_NEXT32(fInputText);
840 startPos = UTEXT_GETNATIVEINDEX(fInputText);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos, status))
845 return FALSE;
846 }
847 } else {
848 for (;;) {
849 if (isLineTerminator(c)) {
850 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText);
852 startPos = UTEXT_GETNATIVEINDEX(fInputText);
853 }
854 MatchAt(startPos, FALSE, status);
855 if (U_FAILURE(status)) {
856 return FALSE;
857 }
858 if (fMatch) {
859 return TRUE;
860 }
861 UTEXT_SETNATIVEINDEX(fInputText, startPos);
862 }
863 if (startPos >= testStartLimit) {
864 fMatch = FALSE;
865 fHitEnd = TRUE;
866 return FALSE;
867 }
868 c = UTEXT_NEXT32(fInputText);
869 startPos = UTEXT_GETNATIVEINDEX(fInputText);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos, status))
874 return FALSE;
875 }
876 }
877 }
878
879 default:
880 U_ASSERT(FALSE);
881 }
882
883 U_ASSERT(FALSE);
884 return FALSE;
885 }
886
887
888
find(int64_t start,UErrorCode & status)889 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
890 if (U_FAILURE(status)) {
891 return FALSE;
892 }
893 if (U_FAILURE(fDeferredStatus)) {
894 status = fDeferredStatus;
895 return FALSE;
896 }
897 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
898 // This will reset the region to be the full input length.
899 if (start < 0) {
900 status = U_INDEX_OUTOFBOUNDS_ERROR;
901 return FALSE;
902 }
903
904 int64_t nativeStart = start;
905 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
906 status = U_INDEX_OUTOFBOUNDS_ERROR;
907 return FALSE;
908 }
909 fMatchEnd = nativeStart;
910 return find(status);
911 }
912
913
914 //--------------------------------------------------------------------------------
915 //
916 // findUsingChunk() -- like find(), but with the advance knowledge that the
917 // entire string is available in the UText's chunk buffer.
918 //
919 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)920 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
921 // Start at the position of the last match end. (Will be zero if the
922 // matcher has been reset.
923 //
924
925 int32_t startPos = (int32_t)fMatchEnd;
926 if (startPos==0) {
927 startPos = (int32_t)fActiveStart;
928 }
929
930 const UChar *inputBuf = fInputText->chunkContents;
931
932 if (fMatch) {
933 // Save the position of any previous successful match.
934 fLastMatchEnd = fMatchEnd;
935
936 if (fMatchStart == fMatchEnd) {
937 // Previous match had zero length. Move start position up one position
938 // to avoid sending find() into a loop on zero-length matches.
939 if (startPos >= fActiveLimit) {
940 fMatch = FALSE;
941 fHitEnd = TRUE;
942 return FALSE;
943 }
944 U16_FWD_1(inputBuf, startPos, fInputLength);
945 }
946 } else {
947 if (fLastMatchEnd >= 0) {
948 // A previous find() failed to match. Don't try again.
949 // (without this test, a pattern with a zero-length match
950 // could match again at the end of an input string.)
951 fHitEnd = TRUE;
952 return FALSE;
953 }
954 }
955
956
957 // Compute the position in the input string beyond which a match can not begin, because
958 // the minimum length match would extend past the end of the input.
959 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
960 // Be aware of possible overflows if making changes here.
961 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
962 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
963 if (startPos > testLen) {
964 fMatch = FALSE;
965 fHitEnd = TRUE;
966 return FALSE;
967 }
968
969 UChar32 c;
970 U_ASSERT(startPos >= 0);
971
972 switch (fPattern->fStartType) {
973 case START_NO_INFO:
974 // No optimization was found.
975 // Try a match at each input position.
976 for (;;) {
977 MatchChunkAt(startPos, FALSE, status);
978 if (U_FAILURE(status)) {
979 return FALSE;
980 }
981 if (fMatch) {
982 return TRUE;
983 }
984 if (startPos >= testLen) {
985 fHitEnd = TRUE;
986 return FALSE;
987 }
988 U16_FWD_1(inputBuf, startPos, fActiveLimit);
989 // Note that it's perfectly OK for a pattern to have a zero-length
990 // match at the end of a string, so we must make sure that the loop
991 // runs with startPos == testLen the last time through.
992 if (findProgressInterrupt(startPos, status))
993 return FALSE;
994 }
995 U_ASSERT(FALSE);
996
997 case START_START:
998 // Matches are only possible at the start of the input string
999 // (pattern begins with ^ or \A)
1000 if (startPos > fActiveStart) {
1001 fMatch = FALSE;
1002 return FALSE;
1003 }
1004 MatchChunkAt(startPos, FALSE, status);
1005 if (U_FAILURE(status)) {
1006 return FALSE;
1007 }
1008 return fMatch;
1009
1010
1011 case START_SET:
1012 {
1013 // Match may start on any char from a pre-computed set.
1014 U_ASSERT(fPattern->fMinMatchLen > 0);
1015 for (;;) {
1016 int32_t pos = startPos;
1017 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1018 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1019 (c>=256 && fPattern->fInitialChars->contains(c))) {
1020 MatchChunkAt(pos, FALSE, status);
1021 if (U_FAILURE(status)) {
1022 return FALSE;
1023 }
1024 if (fMatch) {
1025 return TRUE;
1026 }
1027 }
1028 if (startPos > testLen) {
1029 fMatch = FALSE;
1030 fHitEnd = TRUE;
1031 return FALSE;
1032 }
1033 if (findProgressInterrupt(startPos, status))
1034 return FALSE;
1035 }
1036 }
1037 U_ASSERT(FALSE);
1038
1039 case START_STRING:
1040 case START_CHAR:
1041 {
1042 // Match starts on exactly one char.
1043 U_ASSERT(fPattern->fMinMatchLen > 0);
1044 UChar32 theChar = fPattern->fInitialChar;
1045 for (;;) {
1046 int32_t pos = startPos;
1047 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1048 if (c == theChar) {
1049 MatchChunkAt(pos, FALSE, status);
1050 if (U_FAILURE(status)) {
1051 return FALSE;
1052 }
1053 if (fMatch) {
1054 return TRUE;
1055 }
1056 }
1057 if (startPos > testLen) {
1058 fMatch = FALSE;
1059 fHitEnd = TRUE;
1060 return FALSE;
1061 }
1062 if (findProgressInterrupt(startPos, status))
1063 return FALSE;
1064 }
1065 }
1066 U_ASSERT(FALSE);
1067
1068 case START_LINE:
1069 {
1070 UChar32 c;
1071 if (startPos == fAnchorStart) {
1072 MatchChunkAt(startPos, FALSE, status);
1073 if (U_FAILURE(status)) {
1074 return FALSE;
1075 }
1076 if (fMatch) {
1077 return TRUE;
1078 }
1079 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1080 }
1081
1082 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1083 for (;;) {
1084 c = inputBuf[startPos-1];
1085 if (c == 0x0a) {
1086 MatchChunkAt(startPos, FALSE, status);
1087 if (U_FAILURE(status)) {
1088 return FALSE;
1089 }
1090 if (fMatch) {
1091 return TRUE;
1092 }
1093 }
1094 if (startPos >= testLen) {
1095 fMatch = FALSE;
1096 fHitEnd = TRUE;
1097 return FALSE;
1098 }
1099 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1100 // Note that it's perfectly OK for a pattern to have a zero-length
1101 // match at the end of a string, so we must make sure that the loop
1102 // runs with startPos == testLen the last time through.
1103 if (findProgressInterrupt(startPos, status))
1104 return FALSE;
1105 }
1106 } else {
1107 for (;;) {
1108 c = inputBuf[startPos-1];
1109 if (isLineTerminator(c)) {
1110 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1111 startPos++;
1112 }
1113 MatchChunkAt(startPos, FALSE, status);
1114 if (U_FAILURE(status)) {
1115 return FALSE;
1116 }
1117 if (fMatch) {
1118 return TRUE;
1119 }
1120 }
1121 if (startPos >= testLen) {
1122 fMatch = FALSE;
1123 fHitEnd = TRUE;
1124 return FALSE;
1125 }
1126 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1127 // Note that it's perfectly OK for a pattern to have a zero-length
1128 // match at the end of a string, so we must make sure that the loop
1129 // runs with startPos == testLen the last time through.
1130 if (findProgressInterrupt(startPos, status))
1131 return FALSE;
1132 }
1133 }
1134 }
1135
1136 default:
1137 U_ASSERT(FALSE);
1138 }
1139
1140 U_ASSERT(FALSE);
1141 return FALSE;
1142 }
1143
1144
1145
1146 //--------------------------------------------------------------------------------
1147 //
1148 // group()
1149 //
1150 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1151 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1152 return group(0, status);
1153 }
1154
1155 // Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1156 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1157 return group(0, dest, group_len, status);
1158 }
1159
1160 // Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1161 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1162 group_len = 0;
1163 if (U_FAILURE(status)) {
1164 return dest;
1165 }
1166 if (U_FAILURE(fDeferredStatus)) {
1167 status = fDeferredStatus;
1168 } else if (fMatch == FALSE) {
1169 status = U_REGEX_INVALID_STATE;
1170 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1171 status = U_INDEX_OUTOFBOUNDS_ERROR;
1172 }
1173
1174 if (U_FAILURE(status)) {
1175 return dest;
1176 }
1177
1178 int64_t s, e;
1179 if (groupNum == 0) {
1180 s = fMatchStart;
1181 e = fMatchEnd;
1182 } else {
1183 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1184 U_ASSERT(groupOffset < fPattern->fFrameSize);
1185 U_ASSERT(groupOffset >= 0);
1186 s = fFrame->fExtra[groupOffset];
1187 e = fFrame->fExtra[groupOffset+1];
1188 }
1189
1190 if (s < 0) {
1191 // A capture group wasn't part of the match
1192 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1193 }
1194 U_ASSERT(s <= e);
1195 group_len = e - s;
1196
1197 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1198 if (dest)
1199 UTEXT_SETNATIVEINDEX(dest, s);
1200 return dest;
1201 }
1202
group(int32_t groupNum,UErrorCode & status) const1203 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1204 UnicodeString result;
1205 int64_t groupStart = start64(groupNum, status);
1206 int64_t groupEnd = end64(groupNum, status);
1207 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1208 return result;
1209 }
1210
1211 // Get the group length using a utext_extract preflight.
1212 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1213 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1214 if (status != U_BUFFER_OVERFLOW_ERROR) {
1215 return result;
1216 }
1217
1218 status = U_ZERO_ERROR;
1219 UChar *buf = result.getBuffer(length);
1220 if (buf == NULL) {
1221 status = U_MEMORY_ALLOCATION_ERROR;
1222 } else {
1223 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1224 result.releaseBuffer(extractLength);
1225 U_ASSERT(length == extractLength);
1226 }
1227 return result;
1228 }
1229
1230
1231 //--------------------------------------------------------------------------------
1232 //
1233 // appendGroup() -- currently internal only, appends a group to a UText rather
1234 // than replacing its contents
1235 //
1236 //--------------------------------------------------------------------------------
1237
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1238 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1239 if (U_FAILURE(status)) {
1240 return 0;
1241 }
1242 if (U_FAILURE(fDeferredStatus)) {
1243 status = fDeferredStatus;
1244 return 0;
1245 }
1246 int64_t destLen = utext_nativeLength(dest);
1247
1248 if (fMatch == FALSE) {
1249 status = U_REGEX_INVALID_STATE;
1250 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1251 }
1252 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1253 status = U_INDEX_OUTOFBOUNDS_ERROR;
1254 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1255 }
1256
1257 int64_t s, e;
1258 if (groupNum == 0) {
1259 s = fMatchStart;
1260 e = fMatchEnd;
1261 } else {
1262 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1263 U_ASSERT(groupOffset < fPattern->fFrameSize);
1264 U_ASSERT(groupOffset >= 0);
1265 s = fFrame->fExtra[groupOffset];
1266 e = fFrame->fExtra[groupOffset+1];
1267 }
1268
1269 if (s < 0) {
1270 // A capture group wasn't part of the match
1271 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1272 }
1273 U_ASSERT(s <= e);
1274
1275 int64_t deltaLen;
1276 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1277 U_ASSERT(e <= fInputLength);
1278 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1279 } else {
1280 int32_t len16;
1281 if (UTEXT_USES_U16(fInputText)) {
1282 len16 = (int32_t)(e-s);
1283 } else {
1284 UErrorCode lengthStatus = U_ZERO_ERROR;
1285 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1286 }
1287 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1288 if (groupChars == NULL) {
1289 status = U_MEMORY_ALLOCATION_ERROR;
1290 return 0;
1291 }
1292 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1293
1294 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1295 uprv_free(groupChars);
1296 }
1297 return deltaLen;
1298 }
1299
1300
1301
1302 //--------------------------------------------------------------------------------
1303 //
1304 // groupCount()
1305 //
1306 //--------------------------------------------------------------------------------
groupCount() const1307 int32_t RegexMatcher::groupCount() const {
1308 return fPattern->fGroupMap->size();
1309 }
1310
1311 //--------------------------------------------------------------------------------
1312 //
1313 // hasAnchoringBounds()
1314 //
1315 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1316 UBool RegexMatcher::hasAnchoringBounds() const {
1317 return fAnchoringBounds;
1318 }
1319
1320
1321 //--------------------------------------------------------------------------------
1322 //
1323 // hasTransparentBounds()
1324 //
1325 //--------------------------------------------------------------------------------
hasTransparentBounds() const1326 UBool RegexMatcher::hasTransparentBounds() const {
1327 return fTransparentBounds;
1328 }
1329
1330
1331
1332 //--------------------------------------------------------------------------------
1333 //
1334 // hitEnd()
1335 //
1336 //--------------------------------------------------------------------------------
hitEnd() const1337 UBool RegexMatcher::hitEnd() const {
1338 return fHitEnd;
1339 }
1340
1341
1342 //--------------------------------------------------------------------------------
1343 //
1344 // input()
1345 //
1346 //--------------------------------------------------------------------------------
input() const1347 const UnicodeString &RegexMatcher::input() const {
1348 if (!fInput) {
1349 UErrorCode status = U_ZERO_ERROR;
1350 int32_t len16;
1351 if (UTEXT_USES_U16(fInputText)) {
1352 len16 = (int32_t)fInputLength;
1353 } else {
1354 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1355 status = U_ZERO_ERROR; // overflow, length status
1356 }
1357 UnicodeString *result = new UnicodeString(len16, 0, 0);
1358
1359 UChar *inputChars = result->getBuffer(len16);
1360 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1361 result->releaseBuffer(len16);
1362
1363 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1364 }
1365
1366 return *fInput;
1367 }
1368
1369 //--------------------------------------------------------------------------------
1370 //
1371 // inputText()
1372 //
1373 //--------------------------------------------------------------------------------
inputText() const1374 UText *RegexMatcher::inputText() const {
1375 return fInputText;
1376 }
1377
1378
1379 //--------------------------------------------------------------------------------
1380 //
1381 // getInput() -- like inputText(), but makes a clone or copies into another UText
1382 //
1383 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1384 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1385 if (U_FAILURE(status)) {
1386 return dest;
1387 }
1388 if (U_FAILURE(fDeferredStatus)) {
1389 status = fDeferredStatus;
1390 return dest;
1391 }
1392
1393 if (dest) {
1394 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1395 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1396 } else {
1397 int32_t input16Len;
1398 if (UTEXT_USES_U16(fInputText)) {
1399 input16Len = (int32_t)fInputLength;
1400 } else {
1401 UErrorCode lengthStatus = U_ZERO_ERROR;
1402 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1403 }
1404 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1405 if (inputChars == NULL) {
1406 return dest;
1407 }
1408
1409 status = U_ZERO_ERROR;
1410 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1411 status = U_ZERO_ERROR;
1412 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1413
1414 uprv_free(inputChars);
1415 }
1416 return dest;
1417 } else {
1418 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1419 }
1420 }
1421
1422
1423 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1424 static UBool compat_SyncMutableUTextContents(UText *ut) {
1425 UBool retVal = FALSE;
1426
1427 // In the following test, we're really only interested in whether the UText should switch
1428 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1429 // will still point to the correct data.
1430 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1431 UnicodeString *us=(UnicodeString *)ut->context;
1432
1433 // Update to the latest length.
1434 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1435 int32_t newLength = us->length();
1436
1437 // Update the chunk description.
1438 // The buffer may have switched between stack- and heap-based.
1439 ut->chunkContents = us->getBuffer();
1440 ut->chunkLength = newLength;
1441 ut->chunkNativeLimit = newLength;
1442 ut->nativeIndexingLimit = newLength;
1443 retVal = TRUE;
1444 }
1445
1446 return retVal;
1447 }
1448
1449 //--------------------------------------------------------------------------------
1450 //
1451 // lookingAt()
1452 //
1453 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1454 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1455 if (U_FAILURE(status)) {
1456 return FALSE;
1457 }
1458 if (U_FAILURE(fDeferredStatus)) {
1459 status = fDeferredStatus;
1460 return FALSE;
1461 }
1462
1463 if (fInputUniStrMaybeMutable) {
1464 if (compat_SyncMutableUTextContents(fInputText)) {
1465 fInputLength = utext_nativeLength(fInputText);
1466 reset();
1467 }
1468 }
1469 else {
1470 resetPreserveRegion();
1471 }
1472 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1473 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1474 } else {
1475 MatchAt(fActiveStart, FALSE, status);
1476 }
1477 return fMatch;
1478 }
1479
1480
lookingAt(int64_t start,UErrorCode & status)1481 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1482 if (U_FAILURE(status)) {
1483 return FALSE;
1484 }
1485 if (U_FAILURE(fDeferredStatus)) {
1486 status = fDeferredStatus;
1487 return FALSE;
1488 }
1489 reset();
1490
1491 if (start < 0) {
1492 status = U_INDEX_OUTOFBOUNDS_ERROR;
1493 return FALSE;
1494 }
1495
1496 if (fInputUniStrMaybeMutable) {
1497 if (compat_SyncMutableUTextContents(fInputText)) {
1498 fInputLength = utext_nativeLength(fInputText);
1499 reset();
1500 }
1501 }
1502
1503 int64_t nativeStart;
1504 nativeStart = start;
1505 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1506 status = U_INDEX_OUTOFBOUNDS_ERROR;
1507 return FALSE;
1508 }
1509
1510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1511 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1512 } else {
1513 MatchAt(nativeStart, FALSE, status);
1514 }
1515 return fMatch;
1516 }
1517
1518
1519
1520 //--------------------------------------------------------------------------------
1521 //
1522 // matches()
1523 //
1524 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1525 UBool RegexMatcher::matches(UErrorCode &status) {
1526 if (U_FAILURE(status)) {
1527 return FALSE;
1528 }
1529 if (U_FAILURE(fDeferredStatus)) {
1530 status = fDeferredStatus;
1531 return FALSE;
1532 }
1533
1534 if (fInputUniStrMaybeMutable) {
1535 if (compat_SyncMutableUTextContents(fInputText)) {
1536 fInputLength = utext_nativeLength(fInputText);
1537 reset();
1538 }
1539 }
1540 else {
1541 resetPreserveRegion();
1542 }
1543
1544 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1545 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1546 } else {
1547 MatchAt(fActiveStart, TRUE, status);
1548 }
1549 return fMatch;
1550 }
1551
1552
matches(int64_t start,UErrorCode & status)1553 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1554 if (U_FAILURE(status)) {
1555 return FALSE;
1556 }
1557 if (U_FAILURE(fDeferredStatus)) {
1558 status = fDeferredStatus;
1559 return FALSE;
1560 }
1561 reset();
1562
1563 if (start < 0) {
1564 status = U_INDEX_OUTOFBOUNDS_ERROR;
1565 return FALSE;
1566 }
1567
1568 if (fInputUniStrMaybeMutable) {
1569 if (compat_SyncMutableUTextContents(fInputText)) {
1570 fInputLength = utext_nativeLength(fInputText);
1571 reset();
1572 }
1573 }
1574
1575 int64_t nativeStart;
1576 nativeStart = start;
1577 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1578 status = U_INDEX_OUTOFBOUNDS_ERROR;
1579 return FALSE;
1580 }
1581
1582 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1583 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1584 } else {
1585 MatchAt(nativeStart, TRUE, status);
1586 }
1587 return fMatch;
1588 }
1589
1590
1591
1592 //--------------------------------------------------------------------------------
1593 //
1594 // pattern
1595 //
1596 //--------------------------------------------------------------------------------
pattern() const1597 const RegexPattern &RegexMatcher::pattern() const {
1598 return *fPattern;
1599 }
1600
1601
1602
1603 //--------------------------------------------------------------------------------
1604 //
1605 // region
1606 //
1607 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1608 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1609 if (U_FAILURE(status)) {
1610 return *this;
1611 }
1612
1613 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1614 status = U_ILLEGAL_ARGUMENT_ERROR;
1615 }
1616
1617 int64_t nativeStart = regionStart;
1618 int64_t nativeLimit = regionLimit;
1619 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1620 status = U_ILLEGAL_ARGUMENT_ERROR;
1621 }
1622
1623 if (startIndex == -1)
1624 this->reset();
1625 else
1626 resetPreserveRegion();
1627
1628 fRegionStart = nativeStart;
1629 fRegionLimit = nativeLimit;
1630 fActiveStart = nativeStart;
1631 fActiveLimit = nativeLimit;
1632
1633 if (startIndex != -1) {
1634 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1635 status = U_INDEX_OUTOFBOUNDS_ERROR;
1636 }
1637 fMatchEnd = startIndex;
1638 }
1639
1640 if (!fTransparentBounds) {
1641 fLookStart = nativeStart;
1642 fLookLimit = nativeLimit;
1643 }
1644 if (fAnchoringBounds) {
1645 fAnchorStart = nativeStart;
1646 fAnchorLimit = nativeLimit;
1647 }
1648 return *this;
1649 }
1650
region(int64_t start,int64_t limit,UErrorCode & status)1651 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1652 return region(start, limit, -1, status);
1653 }
1654
1655 //--------------------------------------------------------------------------------
1656 //
1657 // regionEnd
1658 //
1659 //--------------------------------------------------------------------------------
regionEnd() const1660 int32_t RegexMatcher::regionEnd() const {
1661 return (int32_t)fRegionLimit;
1662 }
1663
regionEnd64() const1664 int64_t RegexMatcher::regionEnd64() const {
1665 return fRegionLimit;
1666 }
1667
1668 //--------------------------------------------------------------------------------
1669 //
1670 // regionStart
1671 //
1672 //--------------------------------------------------------------------------------
regionStart() const1673 int32_t RegexMatcher::regionStart() const {
1674 return (int32_t)fRegionStart;
1675 }
1676
regionStart64() const1677 int64_t RegexMatcher::regionStart64() const {
1678 return fRegionStart;
1679 }
1680
1681
1682 //--------------------------------------------------------------------------------
1683 //
1684 // replaceAll
1685 //
1686 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1687 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1688 UText replacementText = UTEXT_INITIALIZER;
1689 UText resultText = UTEXT_INITIALIZER;
1690 UnicodeString resultString;
1691 if (U_FAILURE(status)) {
1692 return resultString;
1693 }
1694
1695 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1696 utext_openUnicodeString(&resultText, &resultString, &status);
1697
1698 replaceAll(&replacementText, &resultText, status);
1699
1700 utext_close(&resultText);
1701 utext_close(&replacementText);
1702
1703 return resultString;
1704 }
1705
1706
1707 //
1708 // replaceAll, UText mode
1709 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1710 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1711 if (U_FAILURE(status)) {
1712 return dest;
1713 }
1714 if (U_FAILURE(fDeferredStatus)) {
1715 status = fDeferredStatus;
1716 return dest;
1717 }
1718
1719 if (dest == NULL) {
1720 UnicodeString emptyString;
1721 UText empty = UTEXT_INITIALIZER;
1722
1723 utext_openUnicodeString(&empty, &emptyString, &status);
1724 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1725 utext_close(&empty);
1726 }
1727
1728 if (U_SUCCESS(status)) {
1729 reset();
1730 while (find()) {
1731 appendReplacement(dest, replacement, status);
1732 if (U_FAILURE(status)) {
1733 break;
1734 }
1735 }
1736 appendTail(dest, status);
1737 }
1738
1739 return dest;
1740 }
1741
1742
1743 //--------------------------------------------------------------------------------
1744 //
1745 // replaceFirst
1746 //
1747 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1748 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1749 UText replacementText = UTEXT_INITIALIZER;
1750 UText resultText = UTEXT_INITIALIZER;
1751 UnicodeString resultString;
1752
1753 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754 utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756 replaceFirst(&replacementText, &resultText, status);
1757
1758 utext_close(&resultText);
1759 utext_close(&replacementText);
1760
1761 return resultString;
1762 }
1763
1764 //
1765 // replaceFirst, UText mode
1766 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1767 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1768 if (U_FAILURE(status)) {
1769 return dest;
1770 }
1771 if (U_FAILURE(fDeferredStatus)) {
1772 status = fDeferredStatus;
1773 return dest;
1774 }
1775
1776 reset();
1777 if (!find()) {
1778 return getInput(dest, status);
1779 }
1780
1781 if (dest == NULL) {
1782 UnicodeString emptyString;
1783 UText empty = UTEXT_INITIALIZER;
1784
1785 utext_openUnicodeString(&empty, &emptyString, &status);
1786 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1787 utext_close(&empty);
1788 }
1789
1790 appendReplacement(dest, replacement, status);
1791 appendTail(dest, status);
1792
1793 return dest;
1794 }
1795
1796
1797 //--------------------------------------------------------------------------------
1798 //
1799 // requireEnd
1800 //
1801 //--------------------------------------------------------------------------------
requireEnd() const1802 UBool RegexMatcher::requireEnd() const {
1803 return fRequireEnd;
1804 }
1805
1806
1807 //--------------------------------------------------------------------------------
1808 //
1809 // reset
1810 //
1811 //--------------------------------------------------------------------------------
reset()1812 RegexMatcher &RegexMatcher::reset() {
1813 fRegionStart = 0;
1814 fRegionLimit = fInputLength;
1815 fActiveStart = 0;
1816 fActiveLimit = fInputLength;
1817 fAnchorStart = 0;
1818 fAnchorLimit = fInputLength;
1819 fLookStart = 0;
1820 fLookLimit = fInputLength;
1821 resetPreserveRegion();
1822 return *this;
1823 }
1824
1825
1826
resetPreserveRegion()1827 void RegexMatcher::resetPreserveRegion() {
1828 fMatchStart = 0;
1829 fMatchEnd = 0;
1830 fLastMatchEnd = -1;
1831 fAppendPosition = 0;
1832 fMatch = FALSE;
1833 fHitEnd = FALSE;
1834 fRequireEnd = FALSE;
1835 fTime = 0;
1836 fTickCounter = TIMER_INITIAL_VALUE;
1837 //resetStack(); // more expensive than it looks...
1838 }
1839
1840
reset(const UnicodeString & input)1841 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1842 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1843 if (fPattern->fNeedsAltInput) {
1844 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1845 }
1846 if (U_FAILURE(fDeferredStatus)) {
1847 return *this;
1848 }
1849 fInputLength = utext_nativeLength(fInputText);
1850
1851 reset();
1852 delete fInput;
1853 fInput = NULL;
1854
1855 // Do the following for any UnicodeString.
1856 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1857 fInputUniStrMaybeMutable = TRUE;
1858
1859 if (fWordBreakItr != NULL) {
1860 #if UCONFIG_NO_BREAK_ITERATION==0
1861 UErrorCode status = U_ZERO_ERROR;
1862 fWordBreakItr->setText(fInputText, status);
1863 #endif
1864 }
1865 return *this;
1866 }
1867
1868
reset(UText * input)1869 RegexMatcher &RegexMatcher::reset(UText *input) {
1870 if (fInputText != input) {
1871 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1872 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1873 if (U_FAILURE(fDeferredStatus)) {
1874 return *this;
1875 }
1876 fInputLength = utext_nativeLength(fInputText);
1877
1878 delete fInput;
1879 fInput = NULL;
1880
1881 if (fWordBreakItr != NULL) {
1882 #if UCONFIG_NO_BREAK_ITERATION==0
1883 UErrorCode status = U_ZERO_ERROR;
1884 fWordBreakItr->setText(input, status);
1885 #endif
1886 }
1887 }
1888 reset();
1889 fInputUniStrMaybeMutable = FALSE;
1890
1891 return *this;
1892 }
1893
1894 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1895 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1896 return *this;
1897 }*/
1898
reset(int64_t position,UErrorCode & status)1899 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1900 if (U_FAILURE(status)) {
1901 return *this;
1902 }
1903 reset(); // Reset also resets the region to be the entire string.
1904
1905 if (position < 0 || position > fActiveLimit) {
1906 status = U_INDEX_OUTOFBOUNDS_ERROR;
1907 return *this;
1908 }
1909 fMatchEnd = position;
1910 return *this;
1911 }
1912
1913
1914 //--------------------------------------------------------------------------------
1915 //
1916 // refresh
1917 //
1918 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1919 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1920 if (U_FAILURE(status)) {
1921 return *this;
1922 }
1923 if (input == NULL) {
1924 status = U_ILLEGAL_ARGUMENT_ERROR;
1925 return *this;
1926 }
1927 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1928 status = U_ILLEGAL_ARGUMENT_ERROR;
1929 return *this;
1930 }
1931 int64_t pos = utext_getNativeIndex(fInputText);
1932 // Shallow read-only clone of the new UText into the existing input UText
1933 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1934 if (U_FAILURE(status)) {
1935 return *this;
1936 }
1937 utext_setNativeIndex(fInputText, pos);
1938
1939 if (fAltInputText != NULL) {
1940 pos = utext_getNativeIndex(fAltInputText);
1941 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1942 if (U_FAILURE(status)) {
1943 return *this;
1944 }
1945 utext_setNativeIndex(fAltInputText, pos);
1946 }
1947 return *this;
1948 }
1949
1950
1951
1952 //--------------------------------------------------------------------------------
1953 //
1954 // setTrace
1955 //
1956 //--------------------------------------------------------------------------------
setTrace(UBool state)1957 void RegexMatcher::setTrace(UBool state) {
1958 fTraceDebug = state;
1959 }
1960
1961
1962
1963 /**
1964 * UText, replace entire contents of the destination UText with a substring of the source UText.
1965 *
1966 * @param src The source UText
1967 * @param dest The destination UText. Must be writable.
1968 * May be NULL, in which case a new UText will be allocated.
1969 * @param start Start index of source substring.
1970 * @param limit Limit index of source substring.
1971 * @param status An error code.
1972 */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1973 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1974 if (U_FAILURE(*status)) {
1975 return dest;
1976 }
1977 if (start == limit) {
1978 if (dest) {
1979 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1980 return dest;
1981 } else {
1982 return utext_openUChars(NULL, NULL, 0, status);
1983 }
1984 }
1985 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1986 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1987 return dest;
1988 }
1989 *status = U_ZERO_ERROR;
1990 MaybeStackArray<UChar, 40> buffer;
1991 if (length >= buffer.getCapacity()) {
1992 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
1993 if (newBuf == NULL) {
1994 *status = U_MEMORY_ALLOCATION_ERROR;
1995 }
1996 }
1997 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1998 if (dest) {
1999 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2000 return dest;
2001 }
2002
2003 // Caller did not provide a prexisting UText.
2004 // Open a new one, and have it adopt the text buffer storage.
2005 if (U_FAILURE(*status)) {
2006 return NULL;
2007 }
2008 int32_t ownedLength = 0;
2009 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2010 if (ownedBuf == NULL) {
2011 *status = U_MEMORY_ALLOCATION_ERROR;
2012 return NULL;
2013 }
2014 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2015 if (U_FAILURE(*status)) {
2016 uprv_free(ownedBuf);
2017 return NULL;
2018 }
2019 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2020 return result;
2021 }
2022
2023
2024 //---------------------------------------------------------------------
2025 //
2026 // split
2027 //
2028 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2029 int32_t RegexMatcher::split(const UnicodeString &input,
2030 UnicodeString dest[],
2031 int32_t destCapacity,
2032 UErrorCode &status)
2033 {
2034 UText inputText = UTEXT_INITIALIZER;
2035 utext_openConstUnicodeString(&inputText, &input, &status);
2036 if (U_FAILURE(status)) {
2037 return 0;
2038 }
2039
2040 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2041 if (destText == NULL) {
2042 status = U_MEMORY_ALLOCATION_ERROR;
2043 return 0;
2044 }
2045 int32_t i;
2046 for (i = 0; i < destCapacity; i++) {
2047 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2048 }
2049
2050 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2051
2052 for (i = 0; i < destCapacity; i++) {
2053 utext_close(destText[i]);
2054 }
2055
2056 uprv_free(destText);
2057 utext_close(&inputText);
2058 return fieldCount;
2059 }
2060
2061 //
2062 // split, UText mode
2063 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2064 int32_t RegexMatcher::split(UText *input,
2065 UText *dest[],
2066 int32_t destCapacity,
2067 UErrorCode &status)
2068 {
2069 //
2070 // Check arguements for validity
2071 //
2072 if (U_FAILURE(status)) {
2073 return 0;
2074 };
2075
2076 if (destCapacity < 1) {
2077 status = U_ILLEGAL_ARGUMENT_ERROR;
2078 return 0;
2079 }
2080
2081 //
2082 // Reset for the input text
2083 //
2084 reset(input);
2085 int64_t nextOutputStringStart = 0;
2086 if (fActiveLimit == 0) {
2087 return 0;
2088 }
2089
2090 //
2091 // Loop through the input text, searching for the delimiter pattern
2092 //
2093 int32_t i;
2094 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2095 for (i=0; ; i++) {
2096 if (i>=destCapacity-1) {
2097 // There is one or zero output string left.
2098 // Fill the last output string with whatever is left from the input, then exit the loop.
2099 // ( i will be == destCapacity if we filled the output array while processing
2100 // capture groups of the delimiter expression, in which case we will discard the
2101 // last capture group saved in favor of the unprocessed remainder of the
2102 // input string.)
2103 i = destCapacity-1;
2104 if (fActiveLimit > nextOutputStringStart) {
2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2106 if (dest[i]) {
2107 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2108 input->chunkContents+nextOutputStringStart,
2109 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2110 } else {
2111 UText remainingText = UTEXT_INITIALIZER;
2112 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2113 fActiveLimit-nextOutputStringStart, &status);
2114 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2115 utext_close(&remainingText);
2116 }
2117 } else {
2118 UErrorCode lengthStatus = U_ZERO_ERROR;
2119 int32_t remaining16Length =
2120 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2121 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2122 if (remainingChars == NULL) {
2123 status = U_MEMORY_ALLOCATION_ERROR;
2124 break;
2125 }
2126
2127 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2128 if (dest[i]) {
2129 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2130 } else {
2131 UText remainingText = UTEXT_INITIALIZER;
2132 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2133 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2134 utext_close(&remainingText);
2135 }
2136
2137 uprv_free(remainingChars);
2138 }
2139 }
2140 break;
2141 }
2142 if (find()) {
2143 // We found another delimiter. Move everything from where we started looking
2144 // up until the start of the delimiter into the next output string.
2145 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2146 if (dest[i]) {
2147 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2148 input->chunkContents+nextOutputStringStart,
2149 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2150 } else {
2151 UText remainingText = UTEXT_INITIALIZER;
2152 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2153 fMatchStart-nextOutputStringStart, &status);
2154 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2155 utext_close(&remainingText);
2156 }
2157 } else {
2158 UErrorCode lengthStatus = U_ZERO_ERROR;
2159 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2160 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2161 if (remainingChars == NULL) {
2162 status = U_MEMORY_ALLOCATION_ERROR;
2163 break;
2164 }
2165 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2166 if (dest[i]) {
2167 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2168 } else {
2169 UText remainingText = UTEXT_INITIALIZER;
2170 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2171 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2172 utext_close(&remainingText);
2173 }
2174
2175 uprv_free(remainingChars);
2176 }
2177 nextOutputStringStart = fMatchEnd;
2178
2179 // If the delimiter pattern has capturing parentheses, the captured
2180 // text goes out into the next n destination strings.
2181 int32_t groupNum;
2182 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2183 if (i >= destCapacity-2) {
2184 // Never fill the last available output string with capture group text.
2185 // It will filled with the last field, the remainder of the
2186 // unsplit input text.
2187 break;
2188 }
2189 i++;
2190 dest[i] = utext_extract_replace(fInputText, dest[i],
2191 start64(groupNum, status), end64(groupNum, status), &status);
2192 }
2193
2194 if (nextOutputStringStart == fActiveLimit) {
2195 // The delimiter was at the end of the string. We're done, but first
2196 // we output one last empty string, for the empty field following
2197 // the delimiter at the end of input.
2198 if (i+1 < destCapacity) {
2199 ++i;
2200 if (dest[i] == NULL) {
2201 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2202 } else {
2203 static UChar emptyString[] = {(UChar)0};
2204 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2205 }
2206 }
2207 break;
2208
2209 }
2210 }
2211 else
2212 {
2213 // We ran off the end of the input while looking for the next delimiter.
2214 // All the remaining text goes into the current output string.
2215 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2216 if (dest[i]) {
2217 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2218 input->chunkContents+nextOutputStringStart,
2219 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2220 } else {
2221 UText remainingText = UTEXT_INITIALIZER;
2222 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2223 fActiveLimit-nextOutputStringStart, &status);
2224 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2225 utext_close(&remainingText);
2226 }
2227 } else {
2228 UErrorCode lengthStatus = U_ZERO_ERROR;
2229 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2230 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2231 if (remainingChars == NULL) {
2232 status = U_MEMORY_ALLOCATION_ERROR;
2233 break;
2234 }
2235
2236 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2237 if (dest[i]) {
2238 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2239 } else {
2240 UText remainingText = UTEXT_INITIALIZER;
2241 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2242 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2243 utext_close(&remainingText);
2244 }
2245
2246 uprv_free(remainingChars);
2247 }
2248 break;
2249 }
2250 if (U_FAILURE(status)) {
2251 break;
2252 }
2253 } // end of for loop
2254 return i+1;
2255 }
2256
2257
2258 //--------------------------------------------------------------------------------
2259 //
2260 // start
2261 //
2262 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2263 int32_t RegexMatcher::start(UErrorCode &status) const {
2264 return start(0, status);
2265 }
2266
start64(UErrorCode & status) const2267 int64_t RegexMatcher::start64(UErrorCode &status) const {
2268 return start64(0, status);
2269 }
2270
2271 //--------------------------------------------------------------------------------
2272 //
2273 // start(int32_t group, UErrorCode &status)
2274 //
2275 //--------------------------------------------------------------------------------
2276
start64(int32_t group,UErrorCode & status) const2277 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2278 if (U_FAILURE(status)) {
2279 return -1;
2280 }
2281 if (U_FAILURE(fDeferredStatus)) {
2282 status = fDeferredStatus;
2283 return -1;
2284 }
2285 if (fMatch == FALSE) {
2286 status = U_REGEX_INVALID_STATE;
2287 return -1;
2288 }
2289 if (group < 0 || group > fPattern->fGroupMap->size()) {
2290 status = U_INDEX_OUTOFBOUNDS_ERROR;
2291 return -1;
2292 }
2293 int64_t s;
2294 if (group == 0) {
2295 s = fMatchStart;
2296 } else {
2297 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2298 U_ASSERT(groupOffset < fPattern->fFrameSize);
2299 U_ASSERT(groupOffset >= 0);
2300 s = fFrame->fExtra[groupOffset];
2301 }
2302
2303 return s;
2304 }
2305
2306
start(int32_t group,UErrorCode & status) const2307 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2308 return (int32_t)start64(group, status);
2309 }
2310
2311 //--------------------------------------------------------------------------------
2312 //
2313 // useAnchoringBounds
2314 //
2315 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2316 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2317 fAnchoringBounds = b;
2318 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2319 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2320 return *this;
2321 }
2322
2323
2324 //--------------------------------------------------------------------------------
2325 //
2326 // useTransparentBounds
2327 //
2328 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2329 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2330 fTransparentBounds = b;
2331 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2332 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2333 return *this;
2334 }
2335
2336 //--------------------------------------------------------------------------------
2337 //
2338 // setTimeLimit
2339 //
2340 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2341 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2342 if (U_FAILURE(status)) {
2343 return;
2344 }
2345 if (U_FAILURE(fDeferredStatus)) {
2346 status = fDeferredStatus;
2347 return;
2348 }
2349 if (limit < 0) {
2350 status = U_ILLEGAL_ARGUMENT_ERROR;
2351 return;
2352 }
2353 fTimeLimit = limit;
2354 }
2355
2356
2357 //--------------------------------------------------------------------------------
2358 //
2359 // getTimeLimit
2360 //
2361 //--------------------------------------------------------------------------------
getTimeLimit() const2362 int32_t RegexMatcher::getTimeLimit() const {
2363 return fTimeLimit;
2364 }
2365
2366
2367 //--------------------------------------------------------------------------------
2368 //
2369 // setStackLimit
2370 //
2371 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2372 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2373 if (U_FAILURE(status)) {
2374 return;
2375 }
2376 if (U_FAILURE(fDeferredStatus)) {
2377 status = fDeferredStatus;
2378 return;
2379 }
2380 if (limit < 0) {
2381 status = U_ILLEGAL_ARGUMENT_ERROR;
2382 return;
2383 }
2384
2385 // Reset the matcher. This is needed here in case there is a current match
2386 // whose final stack frame (containing the match results, pointed to by fFrame)
2387 // would be lost by resizing to a smaller stack size.
2388 reset();
2389
2390 if (limit == 0) {
2391 // Unlimited stack expansion
2392 fStack->setMaxCapacity(0);
2393 } else {
2394 // Change the units of the limit from bytes to ints, and bump the size up
2395 // to be big enough to hold at least one stack frame for the pattern,
2396 // if it isn't there already.
2397 int32_t adjustedLimit = limit / sizeof(int32_t);
2398 if (adjustedLimit < fPattern->fFrameSize) {
2399 adjustedLimit = fPattern->fFrameSize;
2400 }
2401 fStack->setMaxCapacity(adjustedLimit);
2402 }
2403 fStackLimit = limit;
2404 }
2405
2406
2407 //--------------------------------------------------------------------------------
2408 //
2409 // getStackLimit
2410 //
2411 //--------------------------------------------------------------------------------
getStackLimit() const2412 int32_t RegexMatcher::getStackLimit() const {
2413 return fStackLimit;
2414 }
2415
2416
2417 //--------------------------------------------------------------------------------
2418 //
2419 // setMatchCallback
2420 //
2421 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2422 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2423 const void *context,
2424 UErrorCode &status) {
2425 if (U_FAILURE(status)) {
2426 return;
2427 }
2428 fCallbackFn = callback;
2429 fCallbackContext = context;
2430 }
2431
2432
2433 //--------------------------------------------------------------------------------
2434 //
2435 // getMatchCallback
2436 //
2437 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2438 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2439 const void *&context,
2440 UErrorCode &status) {
2441 if (U_FAILURE(status)) {
2442 return;
2443 }
2444 callback = fCallbackFn;
2445 context = fCallbackContext;
2446 }
2447
2448
2449 //--------------------------------------------------------------------------------
2450 //
2451 // setMatchCallback
2452 //
2453 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2454 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2455 const void *context,
2456 UErrorCode &status) {
2457 if (U_FAILURE(status)) {
2458 return;
2459 }
2460 fFindProgressCallbackFn = callback;
2461 fFindProgressCallbackContext = context;
2462 }
2463
2464
2465 //--------------------------------------------------------------------------------
2466 //
2467 // getMatchCallback
2468 //
2469 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2470 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2471 const void *&context,
2472 UErrorCode &status) {
2473 if (U_FAILURE(status)) {
2474 return;
2475 }
2476 callback = fFindProgressCallbackFn;
2477 context = fFindProgressCallbackContext;
2478 }
2479
2480
2481 //================================================================================
2482 //
2483 // Code following this point in this file is the internal
2484 // Match Engine Implementation.
2485 //
2486 //================================================================================
2487
2488
2489 //--------------------------------------------------------------------------------
2490 //
2491 // resetStack
2492 // Discard any previous contents of the state save stack, and initialize a
2493 // new stack frame to all -1. The -1s are needed for capture group limits,
2494 // where they indicate that a group has not yet matched anything.
2495 //--------------------------------------------------------------------------------
resetStack()2496 REStackFrame *RegexMatcher::resetStack() {
2497 // Discard any previous contents of the state save stack, and initialize a
2498 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2499 // where they indicate that a group has not yet matched anything.
2500 fStack->removeAllElements();
2501
2502 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2503 if(U_FAILURE(fDeferredStatus)) {
2504 return NULL;
2505 }
2506
2507 int32_t i;
2508 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2509 iFrame->fExtra[i] = -1;
2510 }
2511 return iFrame;
2512 }
2513
2514
2515
2516 //--------------------------------------------------------------------------------
2517 //
2518 // isWordBoundary
2519 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2520 // For us,
2521 // If the current char is a combining mark,
2522 // \b is FALSE.
2523 // Else Scan backwards to the first non-combining char.
2524 // We are at a boundary if the this char and the original chars are
2525 // opposite in membership in \w set
2526 //
2527 // parameters: pos - the current position in the input buffer
2528 //
2529 // TODO: double-check edge cases at region boundaries.
2530 //
2531 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2532 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2533 UBool isBoundary = FALSE;
2534 UBool cIsWord = FALSE;
2535
2536 if (pos >= fLookLimit) {
2537 fHitEnd = TRUE;
2538 } else {
2539 // Determine whether char c at current position is a member of the word set of chars.
2540 // If we're off the end of the string, behave as though we're not at a word char.
2541 UTEXT_SETNATIVEINDEX(fInputText, pos);
2542 UChar32 c = UTEXT_CURRENT32(fInputText);
2543 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2544 // Current char is a combining one. Not a boundary.
2545 return FALSE;
2546 }
2547 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2548 }
2549
2550 // Back up until we come to a non-combining char, determine whether
2551 // that char is a word char.
2552 UBool prevCIsWord = FALSE;
2553 for (;;) {
2554 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2555 break;
2556 }
2557 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2558 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2559 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2560 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2561 break;
2562 }
2563 }
2564 isBoundary = cIsWord ^ prevCIsWord;
2565 return isBoundary;
2566 }
2567
isChunkWordBoundary(int32_t pos)2568 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2569 UBool isBoundary = FALSE;
2570 UBool cIsWord = FALSE;
2571
2572 const UChar *inputBuf = fInputText->chunkContents;
2573
2574 if (pos >= fLookLimit) {
2575 fHitEnd = TRUE;
2576 } else {
2577 // Determine whether char c at current position is a member of the word set of chars.
2578 // If we're off the end of the string, behave as though we're not at a word char.
2579 UChar32 c;
2580 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2581 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2582 // Current char is a combining one. Not a boundary.
2583 return FALSE;
2584 }
2585 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2586 }
2587
2588 // Back up until we come to a non-combining char, determine whether
2589 // that char is a word char.
2590 UBool prevCIsWord = FALSE;
2591 for (;;) {
2592 if (pos <= fLookStart) {
2593 break;
2594 }
2595 UChar32 prevChar;
2596 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2597 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2598 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2599 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2600 break;
2601 }
2602 }
2603 isBoundary = cIsWord ^ prevCIsWord;
2604 return isBoundary;
2605 }
2606
2607 //--------------------------------------------------------------------------------
2608 //
2609 // isUWordBoundary
2610 //
2611 // Test for a word boundary using RBBI word break.
2612 //
2613 // parameters: pos - the current position in the input buffer
2614 //
2615 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2616 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2617 UBool returnVal = FALSE;
2618 #if UCONFIG_NO_BREAK_ITERATION==0
2619
2620 // If we haven't yet created a break iterator for this matcher, do it now.
2621 if (fWordBreakItr == NULL) {
2622 fWordBreakItr =
2623 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2624 if (U_FAILURE(fDeferredStatus)) {
2625 return FALSE;
2626 }
2627 fWordBreakItr->setText(fInputText, fDeferredStatus);
2628 }
2629
2630 if (pos >= fLookLimit) {
2631 fHitEnd = TRUE;
2632 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2633 // words are not boundaries. All non-word chars stand by themselves,
2634 // with word boundaries on both sides.
2635 } else {
2636 if (!UTEXT_USES_U16(fInputText)) {
2637 // !!!: Would like a better way to do this!
2638 UErrorCode status = U_ZERO_ERROR;
2639 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2640 }
2641 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2642 }
2643 #endif
2644 return returnVal;
2645 }
2646
2647 //--------------------------------------------------------------------------------
2648 //
2649 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2650 // saves. Increment the "time" counter, and call the
2651 // user callback function if there is one installed.
2652 //
2653 // If the match operation needs to be aborted, either for a time-out
2654 // or because the user callback asked for it, just set an error status.
2655 // The engine will pick that up and stop in its outer loop.
2656 //
2657 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2658 void RegexMatcher::IncrementTime(UErrorCode &status) {
2659 fTickCounter = TIMER_INITIAL_VALUE;
2660 fTime++;
2661 if (fCallbackFn != NULL) {
2662 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2663 status = U_REGEX_STOPPED_BY_CALLER;
2664 return;
2665 }
2666 }
2667 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2668 status = U_REGEX_TIME_OUT;
2669 }
2670 }
2671
2672 //--------------------------------------------------------------------------------
2673 //
2674 // StateSave
2675 // Make a new stack frame, initialized as a copy of the current stack frame.
2676 // Set the pattern index in the original stack frame from the operand value
2677 // in the opcode. Execution of the engine continues with the state in
2678 // the newly created stack frame
2679 //
2680 // Note that reserveBlock() may grow the stack, resulting in the
2681 // whole thing being relocated in memory.
2682 //
2683 // Parameters:
2684 // fp The top frame pointer when called. At return, a new
2685 // fame will be present
2686 // savePatIdx An index into the compiled pattern. Goes into the original
2687 // (not new) frame. If execution ever back-tracks out of the
2688 // new frame, this will be where we continue from in the pattern.
2689 // Return
2690 // The new frame pointer.
2691 //
2692 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2693 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2694 if (U_FAILURE(status)) {
2695 return fp;
2696 }
2697 // push storage for a new frame.
2698 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2699 if (U_FAILURE(status)) {
2700 // Failure on attempted stack expansion.
2701 // Stack function set some other error code, change it to a more
2702 // specific one for regular expressions.
2703 status = U_REGEX_STACK_OVERFLOW;
2704 // We need to return a writable stack frame, so just return the
2705 // previous frame. The match operation will stop quickly
2706 // because of the error status, after which the frame will never
2707 // be looked at again.
2708 return fp;
2709 }
2710 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2711
2712 // New stack frame = copy of old top frame.
2713 int64_t *source = (int64_t *)fp;
2714 int64_t *dest = newFP;
2715 for (;;) {
2716 *dest++ = *source++;
2717 if (source == newFP) {
2718 break;
2719 }
2720 }
2721
2722 fTickCounter--;
2723 if (fTickCounter <= 0) {
2724 IncrementTime(status); // Re-initializes fTickCounter
2725 }
2726 fp->fPatIdx = savePatIdx;
2727 return (REStackFrame *)newFP;
2728 }
2729
2730 #if defined(REGEX_DEBUG)
2731 namespace {
StringFromUText(UText * ut)2732 UnicodeString StringFromUText(UText *ut) {
2733 UnicodeString result;
2734 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2735 result.append(c);
2736 }
2737 return result;
2738 }
2739 }
2740 #endif // REGEX_DEBUG
2741
2742
2743 //--------------------------------------------------------------------------------
2744 //
2745 // MatchAt This is the actual matching engine.
2746 //
2747 // startIdx: begin matching a this index.
2748 // toEnd: if true, match must extend to end of the input region
2749 //
2750 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752 UBool isMatch = FALSE; // True if the we have a match.
2753
2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2755
2756 int32_t op; // Operation from the compiled pattern, split into
2757 int32_t opType; // the opcode
2758 int32_t opValue; // and the operand value.
2759
2760 #ifdef REGEX_RUN_DEBUG
2761 if (fTraceDebug) {
2762 printf("MatchAt(startIdx=%ld)\n", startIdx);
2763 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2764 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2765 }
2766 #endif
2767
2768 if (U_FAILURE(status)) {
2769 return;
2770 }
2771
2772 // Cache frequently referenced items from the compiled pattern
2773 //
2774 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2775
2776 const UChar *litText = fPattern->fLiteralText.getBuffer();
2777 UVector *sets = fPattern->fSets;
2778
2779 fFrameSize = fPattern->fFrameSize;
2780 REStackFrame *fp = resetStack();
2781 if (U_FAILURE(fDeferredStatus)) {
2782 status = fDeferredStatus;
2783 return;
2784 }
2785
2786 fp->fPatIdx = 0;
2787 fp->fInputIdx = startIdx;
2788
2789 // Zero out the pattern's static data
2790 int32_t i;
2791 for (i = 0; i<fPattern->fDataSize; i++) {
2792 fData[i] = 0;
2793 }
2794
2795 //
2796 // Main loop for interpreting the compiled pattern.
2797 // One iteration of the loop per pattern operation performed.
2798 //
2799 for (;;) {
2800 op = (int32_t)pat[fp->fPatIdx];
2801 opType = URX_TYPE(op);
2802 opValue = URX_VAL(op);
2803 #ifdef REGEX_RUN_DEBUG
2804 if (fTraceDebug) {
2805 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2806 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2807 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2808 fPattern->dumpOp(fp->fPatIdx);
2809 }
2810 #endif
2811 fp->fPatIdx++;
2812
2813 switch (opType) {
2814
2815
2816 case URX_NOP:
2817 break;
2818
2819
2820 case URX_BACKTRACK:
2821 // Force a backtrack. In some circumstances, the pattern compiler
2822 // will notice that the pattern can't possibly match anything, and will
2823 // emit one of these at that point.
2824 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2825 break;
2826
2827
2828 case URX_ONECHAR:
2829 if (fp->fInputIdx < fActiveLimit) {
2830 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2831 UChar32 c = UTEXT_NEXT32(fInputText);
2832 if (c == opValue) {
2833 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2834 break;
2835 }
2836 } else {
2837 fHitEnd = TRUE;
2838 }
2839 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2840 break;
2841
2842
2843 case URX_STRING:
2844 {
2845 // Test input against a literal string.
2846 // Strings require two slots in the compiled pattern, one for the
2847 // offset to the string text, and one for the length.
2848
2849 int32_t stringStartIdx = opValue;
2850 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2851 fp->fPatIdx++;
2852 opType = URX_TYPE(op);
2853 int32_t stringLen = URX_VAL(op);
2854 U_ASSERT(opType == URX_STRING_LEN);
2855 U_ASSERT(stringLen >= 2);
2856
2857 const UChar *patternString = litText+stringStartIdx;
2858 int32_t patternStringIndex = 0;
2859 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2860 UChar32 inputChar;
2861 UChar32 patternChar;
2862 UBool success = TRUE;
2863 while (patternStringIndex < stringLen) {
2864 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2865 success = FALSE;
2866 fHitEnd = TRUE;
2867 break;
2868 }
2869 inputChar = UTEXT_NEXT32(fInputText);
2870 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2871 if (patternChar != inputChar) {
2872 success = FALSE;
2873 break;
2874 }
2875 }
2876
2877 if (success) {
2878 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2879 } else {
2880 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2881 }
2882 }
2883 break;
2884
2885
2886 case URX_STATE_SAVE:
2887 fp = StateSave(fp, opValue, status);
2888 break;
2889
2890
2891 case URX_END:
2892 // The match loop will exit via this path on a successful match,
2893 // when we reach the end of the pattern.
2894 if (toEnd && fp->fInputIdx != fActiveLimit) {
2895 // The pattern matched, but not to the end of input. Try some more.
2896 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2897 break;
2898 }
2899 isMatch = TRUE;
2900 goto breakFromLoop;
2901
2902 // Start and End Capture stack frame variables are laid out out like this:
2903 // fp->fExtra[opValue] - The start of a completed capture group
2904 // opValue+1 - The end of a completed capture group
2905 // opValue+2 - the start of a capture group whose end
2906 // has not yet been reached (and might not ever be).
2907 case URX_START_CAPTURE:
2908 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2909 fp->fExtra[opValue+2] = fp->fInputIdx;
2910 break;
2911
2912
2913 case URX_END_CAPTURE:
2914 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2915 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2916 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2917 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2918 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2919 break;
2920
2921
2922 case URX_DOLLAR: // $, test for End of line
2923 // or for position before new line at end of input
2924 {
2925 if (fp->fInputIdx >= fAnchorLimit) {
2926 // We really are at the end of input. Success.
2927 fHitEnd = TRUE;
2928 fRequireEnd = TRUE;
2929 break;
2930 }
2931
2932 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2933
2934 // If we are positioned just before a new-line that is located at the
2935 // end of input, succeed.
2936 UChar32 c = UTEXT_NEXT32(fInputText);
2937 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2938 if (isLineTerminator(c)) {
2939 // If not in the middle of a CR/LF sequence
2940 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2941 // At new-line at end of input. Success
2942 fHitEnd = TRUE;
2943 fRequireEnd = TRUE;
2944
2945 break;
2946 }
2947 }
2948 } else {
2949 UChar32 nextC = UTEXT_NEXT32(fInputText);
2950 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2951 fHitEnd = TRUE;
2952 fRequireEnd = TRUE;
2953 break; // At CR/LF at end of input. Success
2954 }
2955 }
2956
2957 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2958 }
2959 break;
2960
2961
2962 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2963 if (fp->fInputIdx >= fAnchorLimit) {
2964 // Off the end of input. Success.
2965 fHitEnd = TRUE;
2966 fRequireEnd = TRUE;
2967 break;
2968 } else {
2969 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2970 UChar32 c = UTEXT_NEXT32(fInputText);
2971 // Either at the last character of input, or off the end.
2972 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2973 fHitEnd = TRUE;
2974 fRequireEnd = TRUE;
2975 break;
2976 }
2977 }
2978
2979 // Not at end of input. Back-track out.
2980 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2981 break;
2982
2983
2984 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2985 {
2986 if (fp->fInputIdx >= fAnchorLimit) {
2987 // We really are at the end of input. Success.
2988 fHitEnd = TRUE;
2989 fRequireEnd = TRUE;
2990 break;
2991 }
2992 // If we are positioned just before a new-line, succeed.
2993 // It makes no difference where the new-line is within the input.
2994 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2995 UChar32 c = UTEXT_CURRENT32(fInputText);
2996 if (isLineTerminator(c)) {
2997 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2998 // In multi-line mode, hitting a new-line just before the end of input does not
2999 // set the hitEnd or requireEnd flags
3000 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3001 break;
3002 }
3003 }
3004 // not at a new line. Fail.
3005 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3006 }
3007 break;
3008
3009
3010 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3011 {
3012 if (fp->fInputIdx >= fAnchorLimit) {
3013 // We really are at the end of input. Success.
3014 fHitEnd = TRUE;
3015 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3016 break; // adding a new-line would not lose the match.
3017 }
3018 // If we are not positioned just before a new-line, the test fails; backtrack out.
3019 // It makes no difference where the new-line is within the input.
3020 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3021 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3022 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3023 }
3024 }
3025 break;
3026
3027
3028 case URX_CARET: // ^, test for start of line
3029 if (fp->fInputIdx != fAnchorStart) {
3030 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3031 }
3032 break;
3033
3034
3035 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3036 {
3037 if (fp->fInputIdx == fAnchorStart) {
3038 // We are at the start input. Success.
3039 break;
3040 }
3041 // Check whether character just before the current pos is a new-line
3042 // unless we are at the end of input
3043 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3044 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3045 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3046 // It's a new-line. ^ is true. Success.
3047 // TODO: what should be done with positions between a CR and LF?
3048 break;
3049 }
3050 // Not at the start of a line. Fail.
3051 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3052 }
3053 break;
3054
3055
3056 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3057 {
3058 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3059 if (fp->fInputIdx <= fAnchorStart) {
3060 // We are at the start input. Success.
3061 break;
3062 }
3063 // Check whether character just before the current pos is a new-line
3064 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3065 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3066 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3067 if (c != 0x0a) {
3068 // Not at the start of a line. Back-track out.
3069 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3070 }
3071 }
3072 break;
3073
3074 case URX_BACKSLASH_B: // Test for word boundaries
3075 {
3076 UBool success = isWordBoundary(fp->fInputIdx);
3077 success ^= (UBool)(opValue != 0); // flip sense for \B
3078 if (!success) {
3079 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3080 }
3081 }
3082 break;
3083
3084
3085 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3086 {
3087 UBool success = isUWordBoundary(fp->fInputIdx);
3088 success ^= (UBool)(opValue != 0); // flip sense for \B
3089 if (!success) {
3090 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3091 }
3092 }
3093 break;
3094
3095
3096 case URX_BACKSLASH_D: // Test for decimal digit
3097 {
3098 if (fp->fInputIdx >= fActiveLimit) {
3099 fHitEnd = TRUE;
3100 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3101 break;
3102 }
3103
3104 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3105
3106 UChar32 c = UTEXT_NEXT32(fInputText);
3107 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3108 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3109 success ^= (UBool)(opValue != 0); // flip sense for \D
3110 if (success) {
3111 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3112 } else {
3113 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3114 }
3115 }
3116 break;
3117
3118
3119 case URX_BACKSLASH_G: // Test for position at end of previous match
3120 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3121 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3122 }
3123 break;
3124
3125
3126 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3127 {
3128 if (fp->fInputIdx >= fActiveLimit) {
3129 fHitEnd = TRUE;
3130 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3131 break;
3132 }
3133 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3134 UChar32 c = UTEXT_NEXT32(fInputText);
3135 int8_t ctype = u_charType(c);
3136 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3137 success ^= (UBool)(opValue != 0); // flip sense for \H
3138 if (success) {
3139 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3140 } else {
3141 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3142 }
3143 }
3144 break;
3145
3146
3147 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3148 {
3149 if (fp->fInputIdx >= fActiveLimit) {
3150 fHitEnd = TRUE;
3151 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3152 break;
3153 }
3154 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3155 UChar32 c = UTEXT_NEXT32(fInputText);
3156 if (isLineTerminator(c)) {
3157 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3158 utext_next32(fInputText);
3159 }
3160 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3161 } else {
3162 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3163 }
3164 }
3165 break;
3166
3167
3168 case URX_BACKSLASH_V: // \v, any single line ending character.
3169 {
3170 if (fp->fInputIdx >= fActiveLimit) {
3171 fHitEnd = TRUE;
3172 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3173 break;
3174 }
3175 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3176 UChar32 c = UTEXT_NEXT32(fInputText);
3177 UBool success = isLineTerminator(c);
3178 success ^= (UBool)(opValue != 0); // flip sense for \V
3179 if (success) {
3180 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3181 } else {
3182 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3183 }
3184 }
3185 break;
3186
3187
3188 case URX_BACKSLASH_X:
3189 // Match a Grapheme, as defined by Unicode TR 29.
3190 // Differs slightly from Perl, which consumes combining marks independently
3191 // of context.
3192 {
3193
3194 // Fail if at end of input
3195 if (fp->fInputIdx >= fActiveLimit) {
3196 fHitEnd = TRUE;
3197 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3198 break;
3199 }
3200
3201 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3202
3203 // Examine (and consume) the current char.
3204 // Dispatch into a little state machine, based on the char.
3205 UChar32 c;
3206 c = UTEXT_NEXT32(fInputText);
3207 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3208 UnicodeSet **sets = fPattern->fStaticSets;
3209 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3210 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3211 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3212 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3213 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3214 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3215 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3216 goto GC_Extend;
3217
3218
3219
3220 GC_L:
3221 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3222 c = UTEXT_NEXT32(fInputText);
3223 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3224 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3225 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3226 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3227 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3228 (void)UTEXT_PREVIOUS32(fInputText);
3229 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3230 goto GC_Extend;
3231
3232 GC_V:
3233 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3234 c = UTEXT_NEXT32(fInputText);
3235 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3236 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3237 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3238 (void)UTEXT_PREVIOUS32(fInputText);
3239 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3240 goto GC_Extend;
3241
3242 GC_T:
3243 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3244 c = UTEXT_NEXT32(fInputText);
3245 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3246 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3247 (void)UTEXT_PREVIOUS32(fInputText);
3248 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3249 goto GC_Extend;
3250
3251 GC_Extend:
3252 // Combining characters are consumed here
3253 for (;;) {
3254 if (fp->fInputIdx >= fActiveLimit) {
3255 break;
3256 }
3257 c = UTEXT_CURRENT32(fInputText);
3258 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3259 break;
3260 }
3261 (void)UTEXT_NEXT32(fInputText);
3262 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3263 }
3264 goto GC_Done;
3265
3266 GC_Control:
3267 // Most control chars stand alone (don't combine with combining chars),
3268 // except for that CR/LF sequence is a single grapheme cluster.
3269 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3270 c = UTEXT_NEXT32(fInputText);
3271 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3272 }
3273
3274 GC_Done:
3275 if (fp->fInputIdx >= fActiveLimit) {
3276 fHitEnd = TRUE;
3277 }
3278 break;
3279 }
3280
3281
3282
3283
3284 case URX_BACKSLASH_Z: // Test for end of Input
3285 if (fp->fInputIdx < fAnchorLimit) {
3286 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3287 } else {
3288 fHitEnd = TRUE;
3289 fRequireEnd = TRUE;
3290 }
3291 break;
3292
3293
3294
3295 case URX_STATIC_SETREF:
3296 {
3297 // Test input character against one of the predefined sets
3298 // (Word Characters, for example)
3299 // The high bit of the op value is a flag for the match polarity.
3300 // 0: success if input char is in set.
3301 // 1: success if input char is not in set.
3302 if (fp->fInputIdx >= fActiveLimit) {
3303 fHitEnd = TRUE;
3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3305 break;
3306 }
3307
3308 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3309 opValue &= ~URX_NEG_SET;
3310 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3311
3312 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3313 UChar32 c = UTEXT_NEXT32(fInputText);
3314 if (c < 256) {
3315 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3316 if (s8->contains(c)) {
3317 success = !success;
3318 }
3319 } else {
3320 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321 if (s->contains(c)) {
3322 success = !success;
3323 }
3324 }
3325 if (success) {
3326 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3327 } else {
3328 // the character wasn't in the set.
3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3330 }
3331 }
3332 break;
3333
3334
3335 case URX_STAT_SETREF_N:
3336 {
3337 // Test input character for NOT being a member of one of
3338 // the predefined sets (Word Characters, for example)
3339 if (fp->fInputIdx >= fActiveLimit) {
3340 fHitEnd = TRUE;
3341 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3342 break;
3343 }
3344
3345 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3346
3347 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3348
3349 UChar32 c = UTEXT_NEXT32(fInputText);
3350 if (c < 256) {
3351 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3352 if (s8->contains(c) == FALSE) {
3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3354 break;
3355 }
3356 } else {
3357 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3358 if (s->contains(c) == FALSE) {
3359 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3360 break;
3361 }
3362 }
3363 // the character wasn't in the set.
3364 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3365 }
3366 break;
3367
3368
3369 case URX_SETREF:
3370 if (fp->fInputIdx >= fActiveLimit) {
3371 fHitEnd = TRUE;
3372 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3373 break;
3374 } else {
3375 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3376
3377 // There is input left. Pick up one char and test it for set membership.
3378 UChar32 c = UTEXT_NEXT32(fInputText);
3379 U_ASSERT(opValue > 0 && opValue < sets->size());
3380 if (c<256) {
3381 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3382 if (s8->contains(c)) {
3383 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3384 break;
3385 }
3386 } else {
3387 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3388 if (s->contains(c)) {
3389 // The character is in the set. A Match.
3390 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3391 break;
3392 }
3393 }
3394
3395 // the character wasn't in the set.
3396 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3397 }
3398 break;
3399
3400
3401 case URX_DOTANY:
3402 {
3403 // . matches anything, but stops at end-of-line.
3404 if (fp->fInputIdx >= fActiveLimit) {
3405 // At end of input. Match failed. Backtrack out.
3406 fHitEnd = TRUE;
3407 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3408 break;
3409 }
3410
3411 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3412
3413 // There is input left. Advance over one char, unless we've hit end-of-line
3414 UChar32 c = UTEXT_NEXT32(fInputText);
3415 if (isLineTerminator(c)) {
3416 // End of line in normal mode. . does not match.
3417 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3418 break;
3419 }
3420 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3421 }
3422 break;
3423
3424
3425 case URX_DOTANY_ALL:
3426 {
3427 // ., in dot-matches-all (including new lines) mode
3428 if (fp->fInputIdx >= fActiveLimit) {
3429 // At end of input. Match failed. Backtrack out.
3430 fHitEnd = TRUE;
3431 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3432 break;
3433 }
3434
3435 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3436
3437 // There is input left. Advance over one char, except if we are
3438 // at a cr/lf, advance over both of them.
3439 UChar32 c;
3440 c = UTEXT_NEXT32(fInputText);
3441 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3442 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3443 // In the case of a CR/LF, we need to advance over both.
3444 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3445 if (nextc == 0x0a) {
3446 (void)UTEXT_NEXT32(fInputText);
3447 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3448 }
3449 }
3450 }
3451 break;
3452
3453
3454 case URX_DOTANY_UNIX:
3455 {
3456 // '.' operator, matches all, but stops at end-of-line.
3457 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3458 if (fp->fInputIdx >= fActiveLimit) {
3459 // At end of input. Match failed. Backtrack out.
3460 fHitEnd = TRUE;
3461 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3462 break;
3463 }
3464
3465 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3466
3467 // There is input left. Advance over one char, unless we've hit end-of-line
3468 UChar32 c = UTEXT_NEXT32(fInputText);
3469 if (c == 0x0a) {
3470 // End of line in normal mode. '.' does not match the \n
3471 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3472 } else {
3473 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3474 }
3475 }
3476 break;
3477
3478
3479 case URX_JMP:
3480 fp->fPatIdx = opValue;
3481 break;
3482
3483 case URX_FAIL:
3484 isMatch = FALSE;
3485 goto breakFromLoop;
3486
3487 case URX_JMP_SAV:
3488 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3489 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3490 fp->fPatIdx = opValue; // Then JMP.
3491 break;
3492
3493 case URX_JMP_SAV_X:
3494 // This opcode is used with (x)+, when x can match a zero length string.
3495 // Same as JMP_SAV, except conditional on the match having made forward progress.
3496 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3497 // data address of the input position at the start of the loop.
3498 {
3499 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3500 int32_t stoOp = (int32_t)pat[opValue-1];
3501 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3502 int32_t frameLoc = URX_VAL(stoOp);
3503 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3504 int64_t prevInputIdx = fp->fExtra[frameLoc];
3505 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3506 if (prevInputIdx < fp->fInputIdx) {
3507 // The match did make progress. Repeat the loop.
3508 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3509 fp->fPatIdx = opValue;
3510 fp->fExtra[frameLoc] = fp->fInputIdx;
3511 }
3512 // If the input position did not advance, we do nothing here,
3513 // execution will fall out of the loop.
3514 }
3515 break;
3516
3517 case URX_CTR_INIT:
3518 {
3519 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3520 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3521
3522 // Pick up the three extra operands that CTR_INIT has, and
3523 // skip the pattern location counter past
3524 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3525 fp->fPatIdx += 3;
3526 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3527 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3528 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3529 U_ASSERT(minCount>=0);
3530 U_ASSERT(maxCount>=minCount || maxCount==-1);
3531 U_ASSERT(loopLoc>=fp->fPatIdx);
3532
3533 if (minCount == 0) {
3534 fp = StateSave(fp, loopLoc+1, status);
3535 }
3536 if (maxCount == -1) {
3537 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3538 } else if (maxCount == 0) {
3539 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3540 }
3541 }
3542 break;
3543
3544 case URX_CTR_LOOP:
3545 {
3546 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3547 int32_t initOp = (int32_t)pat[opValue];
3548 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3549 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3550 int32_t minCount = (int32_t)pat[opValue+2];
3551 int32_t maxCount = (int32_t)pat[opValue+3];
3552 (*pCounter)++;
3553 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3554 U_ASSERT(*pCounter == maxCount);
3555 break;
3556 }
3557 if (*pCounter >= minCount) {
3558 if (maxCount == -1) {
3559 // Loop has no hard upper bound.
3560 // Check that it is progressing through the input, break if it is not.
3561 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3562 if (fp->fInputIdx == *pLastInputIdx) {
3563 break;
3564 } else {
3565 *pLastInputIdx = fp->fInputIdx;
3566 }
3567 }
3568 fp = StateSave(fp, fp->fPatIdx, status);
3569 }
3570 fp->fPatIdx = opValue + 4; // Loop back.
3571 }
3572 break;
3573
3574 case URX_CTR_INIT_NG:
3575 {
3576 // Initialize a non-greedy loop
3577 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3578 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3579
3580 // Pick up the three extra operands that CTR_INIT_NG has, and
3581 // skip the pattern location counter past
3582 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3583 fp->fPatIdx += 3;
3584 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3585 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3586 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3587 U_ASSERT(minCount>=0);
3588 U_ASSERT(maxCount>=minCount || maxCount==-1);
3589 U_ASSERT(loopLoc>fp->fPatIdx);
3590 if (maxCount == -1) {
3591 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3592 }
3593
3594 if (minCount == 0) {
3595 if (maxCount != 0) {
3596 fp = StateSave(fp, fp->fPatIdx, status);
3597 }
3598 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3599 }
3600 }
3601 break;
3602
3603 case URX_CTR_LOOP_NG:
3604 {
3605 // Non-greedy {min, max} loops
3606 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3607 int32_t initOp = (int32_t)pat[opValue];
3608 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3609 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3610 int32_t minCount = (int32_t)pat[opValue+2];
3611 int32_t maxCount = (int32_t)pat[opValue+3];
3612
3613 (*pCounter)++;
3614 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3615 // The loop has matched the maximum permitted number of times.
3616 // Break out of here with no action. Matching will
3617 // continue with the following pattern.
3618 U_ASSERT(*pCounter == maxCount);
3619 break;
3620 }
3621
3622 if (*pCounter < minCount) {
3623 // We haven't met the minimum number of matches yet.
3624 // Loop back for another one.
3625 fp->fPatIdx = opValue + 4; // Loop back.
3626 } else {
3627 // We do have the minimum number of matches.
3628
3629 // If there is no upper bound on the loop iterations, check that the input index
3630 // is progressing, and stop the loop if it is not.
3631 if (maxCount == -1) {
3632 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3633 if (fp->fInputIdx == *pLastInputIdx) {
3634 break;
3635 }
3636 *pLastInputIdx = fp->fInputIdx;
3637 }
3638
3639 // Loop Continuation: we will fall into the pattern following the loop
3640 // (non-greedy, don't execute loop body first), but first do
3641 // a state save to the top of the loop, so that a match failure
3642 // in the following pattern will try another iteration of the loop.
3643 fp = StateSave(fp, opValue + 4, status);
3644 }
3645 }
3646 break;
3647
3648 case URX_STO_SP:
3649 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3650 fData[opValue] = fStack->size();
3651 break;
3652
3653 case URX_LD_SP:
3654 {
3655 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3656 int32_t newStackSize = (int32_t)fData[opValue];
3657 U_ASSERT(newStackSize <= fStack->size());
3658 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3659 if (newFP == (int64_t *)fp) {
3660 break;
3661 }
3662 int32_t i;
3663 for (i=0; i<fFrameSize; i++) {
3664 newFP[i] = ((int64_t *)fp)[i];
3665 }
3666 fp = (REStackFrame *)newFP;
3667 fStack->setSize(newStackSize);
3668 }
3669 break;
3670
3671 case URX_BACKREF:
3672 {
3673 U_ASSERT(opValue < fFrameSize);
3674 int64_t groupStartIdx = fp->fExtra[opValue];
3675 int64_t groupEndIdx = fp->fExtra[opValue+1];
3676 U_ASSERT(groupStartIdx <= groupEndIdx);
3677 if (groupStartIdx < 0) {
3678 // This capture group has not participated in the match thus far,
3679 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3680 break;
3681 }
3682 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3683 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3684
3685 // Note: if the capture group match was of an empty string the backref
3686 // match succeeds. Verified by testing: Perl matches succeed
3687 // in this case, so we do too.
3688
3689 UBool success = TRUE;
3690 for (;;) {
3691 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3692 success = TRUE;
3693 break;
3694 }
3695 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3696 success = FALSE;
3697 fHitEnd = TRUE;
3698 break;
3699 }
3700 UChar32 captureGroupChar = utext_next32(fAltInputText);
3701 UChar32 inputChar = utext_next32(fInputText);
3702 if (inputChar != captureGroupChar) {
3703 success = FALSE;
3704 break;
3705 }
3706 }
3707
3708 if (success) {
3709 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3710 } else {
3711 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3712 }
3713 }
3714 break;
3715
3716
3717
3718 case URX_BACKREF_I:
3719 {
3720 U_ASSERT(opValue < fFrameSize);
3721 int64_t groupStartIdx = fp->fExtra[opValue];
3722 int64_t groupEndIdx = fp->fExtra[opValue+1];
3723 U_ASSERT(groupStartIdx <= groupEndIdx);
3724 if (groupStartIdx < 0) {
3725 // This capture group has not participated in the match thus far,
3726 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3727 break;
3728 }
3729 utext_setNativeIndex(fAltInputText, groupStartIdx);
3730 utext_setNativeIndex(fInputText, fp->fInputIdx);
3731 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3732 CaseFoldingUTextIterator inputItr(*fInputText);
3733
3734 // Note: if the capture group match was of an empty string the backref
3735 // match succeeds. Verified by testing: Perl matches succeed
3736 // in this case, so we do too.
3737
3738 UBool success = TRUE;
3739 for (;;) {
3740 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3741 success = TRUE;
3742 break;
3743 }
3744 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3745 success = FALSE;
3746 fHitEnd = TRUE;
3747 break;
3748 }
3749 UChar32 captureGroupChar = captureGroupItr.next();
3750 UChar32 inputChar = inputItr.next();
3751 if (inputChar != captureGroupChar) {
3752 success = FALSE;
3753 break;
3754 }
3755 }
3756
3757 if (success && inputItr.inExpansion()) {
3758 // We otained a match by consuming part of a string obtained from
3759 // case-folding a single code point of the input text.
3760 // This does not count as an overall match.
3761 success = FALSE;
3762 }
3763
3764 if (success) {
3765 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3766 } else {
3767 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3768 }
3769
3770 }
3771 break;
3772
3773 case URX_STO_INP_LOC:
3774 {
3775 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3776 fp->fExtra[opValue] = fp->fInputIdx;
3777 }
3778 break;
3779
3780 case URX_JMPX:
3781 {
3782 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3783 fp->fPatIdx += 1;
3784 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3785 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3786 int64_t savedInputIdx = fp->fExtra[dataLoc];
3787 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3788 if (savedInputIdx < fp->fInputIdx) {
3789 fp->fPatIdx = opValue; // JMP
3790 } else {
3791 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3792 }
3793 }
3794 break;
3795
3796 case URX_LA_START:
3797 {
3798 // Entering a lookahead block.
3799 // Save Stack Ptr, Input Pos.
3800 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3801 fData[opValue] = fStack->size();
3802 fData[opValue+1] = fp->fInputIdx;
3803 fActiveStart = fLookStart; // Set the match region change for
3804 fActiveLimit = fLookLimit; // transparent bounds.
3805 }
3806 break;
3807
3808 case URX_LA_END:
3809 {
3810 // Leaving a look-ahead block.
3811 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3812 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3813 int32_t stackSize = fStack->size();
3814 int32_t newStackSize =(int32_t)fData[opValue];
3815 U_ASSERT(stackSize >= newStackSize);
3816 if (stackSize > newStackSize) {
3817 // Copy the current top frame back to the new (cut back) top frame.
3818 // This makes the capture groups from within the look-ahead
3819 // expression available.
3820 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3821 int32_t i;
3822 for (i=0; i<fFrameSize; i++) {
3823 newFP[i] = ((int64_t *)fp)[i];
3824 }
3825 fp = (REStackFrame *)newFP;
3826 fStack->setSize(newStackSize);
3827 }
3828 fp->fInputIdx = fData[opValue+1];
3829
3830 // Restore the active region bounds in the input string; they may have
3831 // been changed because of transparent bounds on a Region.
3832 fActiveStart = fRegionStart;
3833 fActiveLimit = fRegionLimit;
3834 }
3835 break;
3836
3837 case URX_ONECHAR_I:
3838 // Case insensitive one char. The char from the pattern is already case folded.
3839 // Input text is not, but case folding the input can not reduce two or more code
3840 // points to one.
3841 if (fp->fInputIdx < fActiveLimit) {
3842 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3843
3844 UChar32 c = UTEXT_NEXT32(fInputText);
3845 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3846 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3847 break;
3848 }
3849 } else {
3850 fHitEnd = TRUE;
3851 }
3852
3853 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3854 break;
3855
3856 case URX_STRING_I:
3857 {
3858 // Case-insensitive test input against a literal string.
3859 // Strings require two slots in the compiled pattern, one for the
3860 // offset to the string text, and one for the length.
3861 // The compiled string has already been case folded.
3862 {
3863 const UChar *patternString = litText + opValue;
3864 int32_t patternStringIdx = 0;
3865
3866 op = (int32_t)pat[fp->fPatIdx];
3867 fp->fPatIdx++;
3868 opType = URX_TYPE(op);
3869 opValue = URX_VAL(op);
3870 U_ASSERT(opType == URX_STRING_LEN);
3871 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3872
3873
3874 UChar32 cPattern;
3875 UChar32 cText;
3876 UBool success = TRUE;
3877
3878 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3879 CaseFoldingUTextIterator inputIterator(*fInputText);
3880 while (patternStringIdx < patternStringLen) {
3881 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3882 success = FALSE;
3883 fHitEnd = TRUE;
3884 break;
3885 }
3886 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3887 cText = inputIterator.next();
3888 if (cText != cPattern) {
3889 success = FALSE;
3890 break;
3891 }
3892 }
3893 if (inputIterator.inExpansion()) {
3894 success = FALSE;
3895 }
3896
3897 if (success) {
3898 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3899 } else {
3900 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3901 }
3902 }
3903 }
3904 break;
3905
3906 case URX_LB_START:
3907 {
3908 // Entering a look-behind block.
3909 // Save Stack Ptr, Input Pos.
3910 // TODO: implement transparent bounds. Ticket #6067
3911 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3912 fData[opValue] = fStack->size();
3913 fData[opValue+1] = fp->fInputIdx;
3914 // Init the variable containing the start index for attempted matches.
3915 fData[opValue+2] = -1;
3916 // Save input string length, then reset to pin any matches to end at
3917 // the current position.
3918 fData[opValue+3] = fActiveLimit;
3919 fActiveLimit = fp->fInputIdx;
3920 }
3921 break;
3922
3923
3924 case URX_LB_CONT:
3925 {
3926 // Positive Look-Behind, at top of loop checking for matches of LB expression
3927 // at all possible input starting positions.
3928
3929 // Fetch the min and max possible match lengths. They are the operands
3930 // of this op in the pattern.
3931 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3932 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3933 if (!UTEXT_USES_U16(fInputText)) {
3934 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3935 // The max length need not be exact; it just needs to be >= actual maximum.
3936 maxML *= 3;
3937 }
3938 U_ASSERT(minML <= maxML);
3939 U_ASSERT(minML >= 0);
3940
3941 // Fetch (from data) the last input index where a match was attempted.
3942 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3943 int64_t &lbStartIdx = fData[opValue+2];
3944 if (lbStartIdx < 0) {
3945 // First time through loop.
3946 lbStartIdx = fp->fInputIdx - minML;
3947 if (lbStartIdx > 0) {
3948 // move index to a code point boudary, if it's not on one already.
3949 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3950 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3951 }
3952 } else {
3953 // 2nd through nth time through the loop.
3954 // Back up start position for match by one.
3955 if (lbStartIdx == 0) {
3956 (lbStartIdx)--;
3957 } else {
3958 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3959 (void)UTEXT_PREVIOUS32(fInputText);
3960 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3961 }
3962 }
3963
3964 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3965 // We have tried all potential match starting points without
3966 // getting a match. Backtrack out, and out of the
3967 // Look Behind altogether.
3968 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3969 int64_t restoreInputLen = fData[opValue+3];
3970 U_ASSERT(restoreInputLen >= fActiveLimit);
3971 U_ASSERT(restoreInputLen <= fInputLength);
3972 fActiveLimit = restoreInputLen;
3973 break;
3974 }
3975
3976 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3977 // (successful match will fall off the end of the loop.)
3978 fp = StateSave(fp, fp->fPatIdx-3, status);
3979 fp->fInputIdx = lbStartIdx;
3980 }
3981 break;
3982
3983 case URX_LB_END:
3984 // End of a look-behind block, after a successful match.
3985 {
3986 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3987 if (fp->fInputIdx != fActiveLimit) {
3988 // The look-behind expression matched, but the match did not
3989 // extend all the way to the point that we are looking behind from.
3990 // FAIL out of here, which will take us back to the LB_CONT, which
3991 // will retry the match starting at another position or fail
3992 // the look-behind altogether, whichever is appropriate.
3993 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3994 break;
3995 }
3996
3997 // Look-behind match is good. Restore the orignal input string length,
3998 // which had been truncated to pin the end of the lookbehind match to the
3999 // position being looked-behind.
4000 int64_t originalInputLen = fData[opValue+3];
4001 U_ASSERT(originalInputLen >= fActiveLimit);
4002 U_ASSERT(originalInputLen <= fInputLength);
4003 fActiveLimit = originalInputLen;
4004 }
4005 break;
4006
4007
4008 case URX_LBN_CONT:
4009 {
4010 // Negative Look-Behind, at top of loop checking for matches of LB expression
4011 // at all possible input starting positions.
4012
4013 // Fetch the extra parameters of this op.
4014 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4015 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4016 if (!UTEXT_USES_U16(fInputText)) {
4017 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4018 // The max length need not be exact; it just needs to be >= actual maximum.
4019 maxML *= 3;
4020 }
4021 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4022 continueLoc = URX_VAL(continueLoc);
4023 U_ASSERT(minML <= maxML);
4024 U_ASSERT(minML >= 0);
4025 U_ASSERT(continueLoc > fp->fPatIdx);
4026
4027 // Fetch (from data) the last input index where a match was attempted.
4028 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4029 int64_t &lbStartIdx = fData[opValue+2];
4030 if (lbStartIdx < 0) {
4031 // First time through loop.
4032 lbStartIdx = fp->fInputIdx - minML;
4033 if (lbStartIdx > 0) {
4034 // move index to a code point boudary, if it's not on one already.
4035 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4036 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4037 }
4038 } else {
4039 // 2nd through nth time through the loop.
4040 // Back up start position for match by one.
4041 if (lbStartIdx == 0) {
4042 (lbStartIdx)--;
4043 } else {
4044 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4045 (void)UTEXT_PREVIOUS32(fInputText);
4046 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4047 }
4048 }
4049
4050 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4051 // We have tried all potential match starting points without
4052 // getting a match, which means that the negative lookbehind as
4053 // a whole has succeeded. Jump forward to the continue location
4054 int64_t restoreInputLen = fData[opValue+3];
4055 U_ASSERT(restoreInputLen >= fActiveLimit);
4056 U_ASSERT(restoreInputLen <= fInputLength);
4057 fActiveLimit = restoreInputLen;
4058 fp->fPatIdx = continueLoc;
4059 break;
4060 }
4061
4062 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4063 // (successful match will cause a FAIL out of the loop altogether.)
4064 fp = StateSave(fp, fp->fPatIdx-4, status);
4065 fp->fInputIdx = lbStartIdx;
4066 }
4067 break;
4068
4069 case URX_LBN_END:
4070 // End of a negative look-behind block, after a successful match.
4071 {
4072 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4073 if (fp->fInputIdx != fActiveLimit) {
4074 // The look-behind expression matched, but the match did not
4075 // extend all the way to the point that we are looking behind from.
4076 // FAIL out of here, which will take us back to the LB_CONT, which
4077 // will retry the match starting at another position or succeed
4078 // the look-behind altogether, whichever is appropriate.
4079 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4080 break;
4081 }
4082
4083 // Look-behind expression matched, which means look-behind test as
4084 // a whole Fails
4085
4086 // Restore the orignal input string length, which had been truncated
4087 // inorder to pin the end of the lookbehind match
4088 // to the position being looked-behind.
4089 int64_t originalInputLen = fData[opValue+3];
4090 U_ASSERT(originalInputLen >= fActiveLimit);
4091 U_ASSERT(originalInputLen <= fInputLength);
4092 fActiveLimit = originalInputLen;
4093
4094 // Restore original stack position, discarding any state saved
4095 // by the successful pattern match.
4096 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4097 int32_t newStackSize = (int32_t)fData[opValue];
4098 U_ASSERT(fStack->size() > newStackSize);
4099 fStack->setSize(newStackSize);
4100
4101 // FAIL, which will take control back to someplace
4102 // prior to entering the look-behind test.
4103 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4104 }
4105 break;
4106
4107
4108 case URX_LOOP_SR_I:
4109 // Loop Initialization for the optimized implementation of
4110 // [some character set]*
4111 // This op scans through all matching input.
4112 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4113 {
4114 U_ASSERT(opValue > 0 && opValue < sets->size());
4115 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4116 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4117
4118 // Loop through input, until either the input is exhausted or
4119 // we reach a character that is not a member of the set.
4120 int64_t ix = fp->fInputIdx;
4121 UTEXT_SETNATIVEINDEX(fInputText, ix);
4122 for (;;) {
4123 if (ix >= fActiveLimit) {
4124 fHitEnd = TRUE;
4125 break;
4126 }
4127 UChar32 c = UTEXT_NEXT32(fInputText);
4128 if (c<256) {
4129 if (s8->contains(c) == FALSE) {
4130 break;
4131 }
4132 } else {
4133 if (s->contains(c) == FALSE) {
4134 break;
4135 }
4136 }
4137 ix = UTEXT_GETNATIVEINDEX(fInputText);
4138 }
4139
4140 // If there were no matching characters, skip over the loop altogether.
4141 // The loop doesn't run at all, a * op always succeeds.
4142 if (ix == fp->fInputIdx) {
4143 fp->fPatIdx++; // skip the URX_LOOP_C op.
4144 break;
4145 }
4146
4147 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4148 // must follow. It's operand is the stack location
4149 // that holds the starting input index for the match of this [set]*
4150 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4151 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4152 int32_t stackLoc = URX_VAL(loopcOp);
4153 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4154 fp->fExtra[stackLoc] = fp->fInputIdx;
4155 fp->fInputIdx = ix;
4156
4157 // Save State to the URX_LOOP_C op that follows this one,
4158 // so that match failures in the following code will return to there.
4159 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4160 fp = StateSave(fp, fp->fPatIdx, status);
4161 fp->fPatIdx++;
4162 }
4163 break;
4164
4165
4166 case URX_LOOP_DOT_I:
4167 // Loop Initialization for the optimized implementation of .*
4168 // This op scans through all remaining input.
4169 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4170 {
4171 // Loop through input until the input is exhausted (we reach an end-of-line)
4172 // In DOTALL mode, we can just go straight to the end of the input.
4173 int64_t ix;
4174 if ((opValue & 1) == 1) {
4175 // Dot-matches-All mode. Jump straight to the end of the string.
4176 ix = fActiveLimit;
4177 fHitEnd = TRUE;
4178 } else {
4179 // NOT DOT ALL mode. Line endings do not match '.'
4180 // Scan forward until a line ending or end of input.
4181 ix = fp->fInputIdx;
4182 UTEXT_SETNATIVEINDEX(fInputText, ix);
4183 for (;;) {
4184 if (ix >= fActiveLimit) {
4185 fHitEnd = TRUE;
4186 break;
4187 }
4188 UChar32 c = UTEXT_NEXT32(fInputText);
4189 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4190 if ((c == 0x0a) || // 0x0a is newline in both modes.
4191 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4192 isLineTerminator(c))) {
4193 // char is a line ending. Exit the scanning loop.
4194 break;
4195 }
4196 }
4197 ix = UTEXT_GETNATIVEINDEX(fInputText);
4198 }
4199 }
4200
4201 // If there were no matching characters, skip over the loop altogether.
4202 // The loop doesn't run at all, a * op always succeeds.
4203 if (ix == fp->fInputIdx) {
4204 fp->fPatIdx++; // skip the URX_LOOP_C op.
4205 break;
4206 }
4207
4208 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4209 // must follow. It's operand is the stack location
4210 // that holds the starting input index for the match of this .*
4211 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4212 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4213 int32_t stackLoc = URX_VAL(loopcOp);
4214 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4215 fp->fExtra[stackLoc] = fp->fInputIdx;
4216 fp->fInputIdx = ix;
4217
4218 // Save State to the URX_LOOP_C op that follows this one,
4219 // so that match failures in the following code will return to there.
4220 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4221 fp = StateSave(fp, fp->fPatIdx, status);
4222 fp->fPatIdx++;
4223 }
4224 break;
4225
4226
4227 case URX_LOOP_C:
4228 {
4229 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4230 backSearchIndex = fp->fExtra[opValue];
4231 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4232 if (backSearchIndex == fp->fInputIdx) {
4233 // We've backed up the input idx to the point that the loop started.
4234 // The loop is done. Leave here without saving state.
4235 // Subsequent failures won't come back here.
4236 break;
4237 }
4238 // Set up for the next iteration of the loop, with input index
4239 // backed up by one from the last time through,
4240 // and a state save to this instruction in case the following code fails again.
4241 // (We're going backwards because this loop emulates stack unwinding, not
4242 // the initial scan forward.)
4243 U_ASSERT(fp->fInputIdx > 0);
4244 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4245 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4247
4248 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4249 if (prevC == 0x0a &&
4250 fp->fInputIdx > backSearchIndex &&
4251 twoPrevC == 0x0d) {
4252 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4253 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4254 // .*, stepping back over CRLF pair.
4255 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4256 }
4257 }
4258
4259
4260 fp = StateSave(fp, fp->fPatIdx-1, status);
4261 }
4262 break;
4263
4264
4265
4266 default:
4267 // Trouble. The compiled pattern contains an entry with an
4268 // unrecognized type tag.
4269 U_ASSERT(FALSE);
4270 }
4271
4272 if (U_FAILURE(status)) {
4273 isMatch = FALSE;
4274 break;
4275 }
4276 }
4277
4278 breakFromLoop:
4279 fMatch = isMatch;
4280 if (isMatch) {
4281 fLastMatchEnd = fMatchEnd;
4282 fMatchStart = startIdx;
4283 fMatchEnd = fp->fInputIdx;
4284 }
4285
4286 #ifdef REGEX_RUN_DEBUG
4287 if (fTraceDebug) {
4288 if (isMatch) {
4289 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4290 } else {
4291 printf("No match\n\n");
4292 }
4293 }
4294 #endif
4295
4296 fFrame = fp; // The active stack frame when the engine stopped.
4297 // Contains the capture group results that we need to
4298 // access later.
4299 return;
4300 }
4301
4302
4303 //--------------------------------------------------------------------------------
4304 //
4305 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4306 // assumption that the entire string is available in the UText's
4307 // chunk buffer. For now, that means we can use int32_t indexes,
4308 // except for anything that needs to be saved (like group starts
4309 // and ends).
4310 //
4311 // startIdx: begin matching a this index.
4312 // toEnd: if true, match must extend to end of the input region
4313 //
4314 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4315 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4316 UBool isMatch = FALSE; // True if the we have a match.
4317
4318 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4319
4320 int32_t op; // Operation from the compiled pattern, split into
4321 int32_t opType; // the opcode
4322 int32_t opValue; // and the operand value.
4323
4324 #ifdef REGEX_RUN_DEBUG
4325 if (fTraceDebug) {
4326 printf("MatchAt(startIdx=%d)\n", startIdx);
4327 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4328 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4329 }
4330 #endif
4331
4332 if (U_FAILURE(status)) {
4333 return;
4334 }
4335
4336 // Cache frequently referenced items from the compiled pattern
4337 //
4338 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4339
4340 const UChar *litText = fPattern->fLiteralText.getBuffer();
4341 UVector *sets = fPattern->fSets;
4342
4343 const UChar *inputBuf = fInputText->chunkContents;
4344
4345 fFrameSize = fPattern->fFrameSize;
4346 REStackFrame *fp = resetStack();
4347 if (U_FAILURE(fDeferredStatus)) {
4348 status = fDeferredStatus;
4349 return;
4350 }
4351
4352 fp->fPatIdx = 0;
4353 fp->fInputIdx = startIdx;
4354
4355 // Zero out the pattern's static data
4356 int32_t i;
4357 for (i = 0; i<fPattern->fDataSize; i++) {
4358 fData[i] = 0;
4359 }
4360
4361 //
4362 // Main loop for interpreting the compiled pattern.
4363 // One iteration of the loop per pattern operation performed.
4364 //
4365 for (;;) {
4366 op = (int32_t)pat[fp->fPatIdx];
4367 opType = URX_TYPE(op);
4368 opValue = URX_VAL(op);
4369 #ifdef REGEX_RUN_DEBUG
4370 if (fTraceDebug) {
4371 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4372 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4373 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4374 fPattern->dumpOp(fp->fPatIdx);
4375 }
4376 #endif
4377 fp->fPatIdx++;
4378
4379 switch (opType) {
4380
4381
4382 case URX_NOP:
4383 break;
4384
4385
4386 case URX_BACKTRACK:
4387 // Force a backtrack. In some circumstances, the pattern compiler
4388 // will notice that the pattern can't possibly match anything, and will
4389 // emit one of these at that point.
4390 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4391 break;
4392
4393
4394 case URX_ONECHAR:
4395 if (fp->fInputIdx < fActiveLimit) {
4396 UChar32 c;
4397 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4398 if (c == opValue) {
4399 break;
4400 }
4401 } else {
4402 fHitEnd = TRUE;
4403 }
4404 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4405 break;
4406
4407
4408 case URX_STRING:
4409 {
4410 // Test input against a literal string.
4411 // Strings require two slots in the compiled pattern, one for the
4412 // offset to the string text, and one for the length.
4413 int32_t stringStartIdx = opValue;
4414 int32_t stringLen;
4415
4416 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4417 fp->fPatIdx++;
4418 opType = URX_TYPE(op);
4419 stringLen = URX_VAL(op);
4420 U_ASSERT(opType == URX_STRING_LEN);
4421 U_ASSERT(stringLen >= 2);
4422
4423 const UChar * pInp = inputBuf + fp->fInputIdx;
4424 const UChar * pInpLimit = inputBuf + fActiveLimit;
4425 const UChar * pPat = litText+stringStartIdx;
4426 const UChar * pEnd = pInp + stringLen;
4427 UBool success = TRUE;
4428 while (pInp < pEnd) {
4429 if (pInp >= pInpLimit) {
4430 fHitEnd = TRUE;
4431 success = FALSE;
4432 break;
4433 }
4434 if (*pInp++ != *pPat++) {
4435 success = FALSE;
4436 break;
4437 }
4438 }
4439
4440 if (success) {
4441 fp->fInputIdx += stringLen;
4442 } else {
4443 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4444 }
4445 }
4446 break;
4447
4448
4449 case URX_STATE_SAVE:
4450 fp = StateSave(fp, opValue, status);
4451 break;
4452
4453
4454 case URX_END:
4455 // The match loop will exit via this path on a successful match,
4456 // when we reach the end of the pattern.
4457 if (toEnd && fp->fInputIdx != fActiveLimit) {
4458 // The pattern matched, but not to the end of input. Try some more.
4459 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4460 break;
4461 }
4462 isMatch = TRUE;
4463 goto breakFromLoop;
4464
4465 // Start and End Capture stack frame variables are laid out out like this:
4466 // fp->fExtra[opValue] - The start of a completed capture group
4467 // opValue+1 - The end of a completed capture group
4468 // opValue+2 - the start of a capture group whose end
4469 // has not yet been reached (and might not ever be).
4470 case URX_START_CAPTURE:
4471 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4472 fp->fExtra[opValue+2] = fp->fInputIdx;
4473 break;
4474
4475
4476 case URX_END_CAPTURE:
4477 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4478 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4479 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4480 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4481 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4482 break;
4483
4484
4485 case URX_DOLLAR: // $, test for End of line
4486 // or for position before new line at end of input
4487 if (fp->fInputIdx < fAnchorLimit-2) {
4488 // We are no where near the end of input. Fail.
4489 // This is the common case. Keep it first.
4490 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4491 break;
4492 }
4493 if (fp->fInputIdx >= fAnchorLimit) {
4494 // We really are at the end of input. Success.
4495 fHitEnd = TRUE;
4496 fRequireEnd = TRUE;
4497 break;
4498 }
4499
4500 // If we are positioned just before a new-line that is located at the
4501 // end of input, succeed.
4502 if (fp->fInputIdx == fAnchorLimit-1) {
4503 UChar32 c;
4504 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4505
4506 if (isLineTerminator(c)) {
4507 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4508 // At new-line at end of input. Success
4509 fHitEnd = TRUE;
4510 fRequireEnd = TRUE;
4511 break;
4512 }
4513 }
4514 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4515 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4516 fHitEnd = TRUE;
4517 fRequireEnd = TRUE;
4518 break; // At CR/LF at end of input. Success
4519 }
4520
4521 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4522
4523 break;
4524
4525
4526 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4527 if (fp->fInputIdx >= fAnchorLimit-1) {
4528 // Either at the last character of input, or off the end.
4529 if (fp->fInputIdx == fAnchorLimit-1) {
4530 // At last char of input. Success if it's a new line.
4531 if (inputBuf[fp->fInputIdx] == 0x0a) {
4532 fHitEnd = TRUE;
4533 fRequireEnd = TRUE;
4534 break;
4535 }
4536 } else {
4537 // Off the end of input. Success.
4538 fHitEnd = TRUE;
4539 fRequireEnd = TRUE;
4540 break;
4541 }
4542 }
4543
4544 // Not at end of input. Back-track out.
4545 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4546 break;
4547
4548
4549 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4550 {
4551 if (fp->fInputIdx >= fAnchorLimit) {
4552 // We really are at the end of input. Success.
4553 fHitEnd = TRUE;
4554 fRequireEnd = TRUE;
4555 break;
4556 }
4557 // If we are positioned just before a new-line, succeed.
4558 // It makes no difference where the new-line is within the input.
4559 UChar32 c = inputBuf[fp->fInputIdx];
4560 if (isLineTerminator(c)) {
4561 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4562 // In multi-line mode, hitting a new-line just before the end of input does not
4563 // set the hitEnd or requireEnd flags
4564 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4565 break;
4566 }
4567 }
4568 // not at a new line. Fail.
4569 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4570 }
4571 break;
4572
4573
4574 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4575 {
4576 if (fp->fInputIdx >= fAnchorLimit) {
4577 // We really are at the end of input. Success.
4578 fHitEnd = TRUE;
4579 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4580 break; // adding a new-line would not lose the match.
4581 }
4582 // If we are not positioned just before a new-line, the test fails; backtrack out.
4583 // It makes no difference where the new-line is within the input.
4584 if (inputBuf[fp->fInputIdx] != 0x0a) {
4585 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4586 }
4587 }
4588 break;
4589
4590
4591 case URX_CARET: // ^, test for start of line
4592 if (fp->fInputIdx != fAnchorStart) {
4593 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4594 }
4595 break;
4596
4597
4598 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4599 {
4600 if (fp->fInputIdx == fAnchorStart) {
4601 // We are at the start input. Success.
4602 break;
4603 }
4604 // Check whether character just before the current pos is a new-line
4605 // unless we are at the end of input
4606 UChar c = inputBuf[fp->fInputIdx - 1];
4607 if ((fp->fInputIdx < fAnchorLimit) &&
4608 isLineTerminator(c)) {
4609 // It's a new-line. ^ is true. Success.
4610 // TODO: what should be done with positions between a CR and LF?
4611 break;
4612 }
4613 // Not at the start of a line. Fail.
4614 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615 }
4616 break;
4617
4618
4619 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4620 {
4621 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4622 if (fp->fInputIdx <= fAnchorStart) {
4623 // We are at the start input. Success.
4624 break;
4625 }
4626 // Check whether character just before the current pos is a new-line
4627 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4628 UChar c = inputBuf[fp->fInputIdx - 1];
4629 if (c != 0x0a) {
4630 // Not at the start of a line. Back-track out.
4631 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4632 }
4633 }
4634 break;
4635
4636 case URX_BACKSLASH_B: // Test for word boundaries
4637 {
4638 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4639 success ^= (UBool)(opValue != 0); // flip sense for \B
4640 if (!success) {
4641 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4642 }
4643 }
4644 break;
4645
4646
4647 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4648 {
4649 UBool success = isUWordBoundary(fp->fInputIdx);
4650 success ^= (UBool)(opValue != 0); // flip sense for \B
4651 if (!success) {
4652 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4653 }
4654 }
4655 break;
4656
4657
4658 case URX_BACKSLASH_D: // Test for decimal digit
4659 {
4660 if (fp->fInputIdx >= fActiveLimit) {
4661 fHitEnd = TRUE;
4662 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4663 break;
4664 }
4665
4666 UChar32 c;
4667 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4668 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4669 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4670 success ^= (UBool)(opValue != 0); // flip sense for \D
4671 if (!success) {
4672 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4673 }
4674 }
4675 break;
4676
4677
4678 case URX_BACKSLASH_G: // Test for position at end of previous match
4679 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4680 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4681 }
4682 break;
4683
4684
4685 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4686 {
4687 if (fp->fInputIdx >= fActiveLimit) {
4688 fHitEnd = TRUE;
4689 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4690 break;
4691 }
4692 UChar32 c;
4693 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4694 int8_t ctype = u_charType(c);
4695 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4696 success ^= (UBool)(opValue != 0); // flip sense for \H
4697 if (!success) {
4698 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4699 }
4700 }
4701 break;
4702
4703
4704 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4705 {
4706 if (fp->fInputIdx >= fActiveLimit) {
4707 fHitEnd = TRUE;
4708 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4709 break;
4710 }
4711 UChar32 c;
4712 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4713 if (isLineTerminator(c)) {
4714 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4715 // Check for CR/LF sequence. Consume both together when found.
4716 UChar c2;
4717 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4718 if (c2 != 0x0a) {
4719 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4720 }
4721 }
4722 } else {
4723 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4724 }
4725 }
4726 break;
4727
4728
4729 case URX_BACKSLASH_V: // Any single code point line ending.
4730 {
4731 if (fp->fInputIdx >= fActiveLimit) {
4732 fHitEnd = TRUE;
4733 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4734 break;
4735 }
4736 UChar32 c;
4737 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4738 UBool success = isLineTerminator(c);
4739 success ^= (UBool)(opValue != 0); // flip sense for \V
4740 if (!success) {
4741 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4742 }
4743 }
4744 break;
4745
4746
4747
4748 case URX_BACKSLASH_X:
4749 // Match a Grapheme, as defined by Unicode TR 29.
4750 // Differs slightly from Perl, which consumes combining marks independently
4751 // of context.
4752 {
4753
4754 // Fail if at end of input
4755 if (fp->fInputIdx >= fActiveLimit) {
4756 fHitEnd = TRUE;
4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4758 break;
4759 }
4760
4761 // Examine (and consume) the current char.
4762 // Dispatch into a little state machine, based on the char.
4763 UChar32 c;
4764 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4765 UnicodeSet **sets = fPattern->fStaticSets;
4766 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4767 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4768 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4769 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4770 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4771 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4772 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4773 goto GC_Extend;
4774
4775
4776
4777 GC_L:
4778 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4779 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4780 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4781 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4782 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4783 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4784 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4785 goto GC_Extend;
4786
4787 GC_V:
4788 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4789 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4790 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4791 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4792 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4793 goto GC_Extend;
4794
4795 GC_T:
4796 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4797 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4798 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4799 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4800 goto GC_Extend;
4801
4802 GC_Extend:
4803 // Combining characters are consumed here
4804 for (;;) {
4805 if (fp->fInputIdx >= fActiveLimit) {
4806 break;
4807 }
4808 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4809 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4810 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4811 break;
4812 }
4813 }
4814 goto GC_Done;
4815
4816 GC_Control:
4817 // Most control chars stand alone (don't combine with combining chars),
4818 // except for that CR/LF sequence is a single grapheme cluster.
4819 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4820 fp->fInputIdx++;
4821 }
4822
4823 GC_Done:
4824 if (fp->fInputIdx >= fActiveLimit) {
4825 fHitEnd = TRUE;
4826 }
4827 break;
4828 }
4829
4830
4831
4832
4833 case URX_BACKSLASH_Z: // Test for end of Input
4834 if (fp->fInputIdx < fAnchorLimit) {
4835 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4836 } else {
4837 fHitEnd = TRUE;
4838 fRequireEnd = TRUE;
4839 }
4840 break;
4841
4842
4843
4844 case URX_STATIC_SETREF:
4845 {
4846 // Test input character against one of the predefined sets
4847 // (Word Characters, for example)
4848 // The high bit of the op value is a flag for the match polarity.
4849 // 0: success if input char is in set.
4850 // 1: success if input char is not in set.
4851 if (fp->fInputIdx >= fActiveLimit) {
4852 fHitEnd = TRUE;
4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4854 break;
4855 }
4856
4857 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4858 opValue &= ~URX_NEG_SET;
4859 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4860
4861 UChar32 c;
4862 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4863 if (c < 256) {
4864 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4865 if (s8->contains(c)) {
4866 success = !success;
4867 }
4868 } else {
4869 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4870 if (s->contains(c)) {
4871 success = !success;
4872 }
4873 }
4874 if (!success) {
4875 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4876 }
4877 }
4878 break;
4879
4880
4881 case URX_STAT_SETREF_N:
4882 {
4883 // Test input character for NOT being a member of one of
4884 // the predefined sets (Word Characters, for example)
4885 if (fp->fInputIdx >= fActiveLimit) {
4886 fHitEnd = TRUE;
4887 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888 break;
4889 }
4890
4891 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4892
4893 UChar32 c;
4894 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4895 if (c < 256) {
4896 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4897 if (s8->contains(c) == FALSE) {
4898 break;
4899 }
4900 } else {
4901 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4902 if (s->contains(c) == FALSE) {
4903 break;
4904 }
4905 }
4906 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4907 }
4908 break;
4909
4910
4911 case URX_SETREF:
4912 {
4913 if (fp->fInputIdx >= fActiveLimit) {
4914 fHitEnd = TRUE;
4915 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4916 break;
4917 }
4918
4919 U_ASSERT(opValue > 0 && opValue < sets->size());
4920
4921 // There is input left. Pick up one char and test it for set membership.
4922 UChar32 c;
4923 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4924 if (c<256) {
4925 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4926 if (s8->contains(c)) {
4927 // The character is in the set. A Match.
4928 break;
4929 }
4930 } else {
4931 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4932 if (s->contains(c)) {
4933 // The character is in the set. A Match.
4934 break;
4935 }
4936 }
4937
4938 // the character wasn't in the set.
4939 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4940 }
4941 break;
4942
4943
4944 case URX_DOTANY:
4945 {
4946 // . matches anything, but stops at end-of-line.
4947 if (fp->fInputIdx >= fActiveLimit) {
4948 // At end of input. Match failed. Backtrack out.
4949 fHitEnd = TRUE;
4950 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4951 break;
4952 }
4953
4954 // There is input left. Advance over one char, unless we've hit end-of-line
4955 UChar32 c;
4956 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4957 if (isLineTerminator(c)) {
4958 // End of line in normal mode. . does not match.
4959 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4960 break;
4961 }
4962 }
4963 break;
4964
4965
4966 case URX_DOTANY_ALL:
4967 {
4968 // . in dot-matches-all (including new lines) mode
4969 if (fp->fInputIdx >= fActiveLimit) {
4970 // At end of input. Match failed. Backtrack out.
4971 fHitEnd = TRUE;
4972 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4973 break;
4974 }
4975
4976 // There is input left. Advance over one char, except if we are
4977 // at a cr/lf, advance over both of them.
4978 UChar32 c;
4979 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4980 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4981 // In the case of a CR/LF, we need to advance over both.
4982 if (inputBuf[fp->fInputIdx] == 0x0a) {
4983 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4984 }
4985 }
4986 }
4987 break;
4988
4989
4990 case URX_DOTANY_UNIX:
4991 {
4992 // '.' operator, matches all, but stops at end-of-line.
4993 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4994 if (fp->fInputIdx >= fActiveLimit) {
4995 // At end of input. Match failed. Backtrack out.
4996 fHitEnd = TRUE;
4997 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4998 break;
4999 }
5000
5001 // There is input left. Advance over one char, unless we've hit end-of-line
5002 UChar32 c;
5003 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5004 if (c == 0x0a) {
5005 // End of line in normal mode. '.' does not match the \n
5006 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5007 }
5008 }
5009 break;
5010
5011
5012 case URX_JMP:
5013 fp->fPatIdx = opValue;
5014 break;
5015
5016 case URX_FAIL:
5017 isMatch = FALSE;
5018 goto breakFromLoop;
5019
5020 case URX_JMP_SAV:
5021 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5022 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5023 fp->fPatIdx = opValue; // Then JMP.
5024 break;
5025
5026 case URX_JMP_SAV_X:
5027 // This opcode is used with (x)+, when x can match a zero length string.
5028 // Same as JMP_SAV, except conditional on the match having made forward progress.
5029 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5030 // data address of the input position at the start of the loop.
5031 {
5032 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5033 int32_t stoOp = (int32_t)pat[opValue-1];
5034 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5035 int32_t frameLoc = URX_VAL(stoOp);
5036 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5037 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5038 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5039 if (prevInputIdx < fp->fInputIdx) {
5040 // The match did make progress. Repeat the loop.
5041 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5042 fp->fPatIdx = opValue;
5043 fp->fExtra[frameLoc] = fp->fInputIdx;
5044 }
5045 // If the input position did not advance, we do nothing here,
5046 // execution will fall out of the loop.
5047 }
5048 break;
5049
5050 case URX_CTR_INIT:
5051 {
5052 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5053 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5054
5055 // Pick up the three extra operands that CTR_INIT has, and
5056 // skip the pattern location counter past
5057 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5058 fp->fPatIdx += 3;
5059 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5060 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5061 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5062 U_ASSERT(minCount>=0);
5063 U_ASSERT(maxCount>=minCount || maxCount==-1);
5064 U_ASSERT(loopLoc>=fp->fPatIdx);
5065
5066 if (minCount == 0) {
5067 fp = StateSave(fp, loopLoc+1, status);
5068 }
5069 if (maxCount == -1) {
5070 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5071 } else if (maxCount == 0) {
5072 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5073 }
5074 }
5075 break;
5076
5077 case URX_CTR_LOOP:
5078 {
5079 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5080 int32_t initOp = (int32_t)pat[opValue];
5081 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5082 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5083 int32_t minCount = (int32_t)pat[opValue+2];
5084 int32_t maxCount = (int32_t)pat[opValue+3];
5085 (*pCounter)++;
5086 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5087 U_ASSERT(*pCounter == maxCount);
5088 break;
5089 }
5090 if (*pCounter >= minCount) {
5091 if (maxCount == -1) {
5092 // Loop has no hard upper bound.
5093 // Check that it is progressing through the input, break if it is not.
5094 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5095 if (fp->fInputIdx == *pLastInputIdx) {
5096 break;
5097 } else {
5098 *pLastInputIdx = fp->fInputIdx;
5099 }
5100 }
5101 fp = StateSave(fp, fp->fPatIdx, status);
5102 }
5103 fp->fPatIdx = opValue + 4; // Loop back.
5104 }
5105 break;
5106
5107 case URX_CTR_INIT_NG:
5108 {
5109 // Initialize a non-greedy loop
5110 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5111 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5112
5113 // Pick up the three extra operands that CTR_INIT_NG has, and
5114 // skip the pattern location counter past
5115 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5116 fp->fPatIdx += 3;
5117 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5118 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5119 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5120 U_ASSERT(minCount>=0);
5121 U_ASSERT(maxCount>=minCount || maxCount==-1);
5122 U_ASSERT(loopLoc>fp->fPatIdx);
5123 if (maxCount == -1) {
5124 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5125 }
5126
5127 if (minCount == 0) {
5128 if (maxCount != 0) {
5129 fp = StateSave(fp, fp->fPatIdx, status);
5130 }
5131 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5132 }
5133 }
5134 break;
5135
5136 case URX_CTR_LOOP_NG:
5137 {
5138 // Non-greedy {min, max} loops
5139 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5140 int32_t initOp = (int32_t)pat[opValue];
5141 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5142 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5143 int32_t minCount = (int32_t)pat[opValue+2];
5144 int32_t maxCount = (int32_t)pat[opValue+3];
5145
5146 (*pCounter)++;
5147 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5148 // The loop has matched the maximum permitted number of times.
5149 // Break out of here with no action. Matching will
5150 // continue with the following pattern.
5151 U_ASSERT(*pCounter == maxCount);
5152 break;
5153 }
5154
5155 if (*pCounter < minCount) {
5156 // We haven't met the minimum number of matches yet.
5157 // Loop back for another one.
5158 fp->fPatIdx = opValue + 4; // Loop back.
5159 } else {
5160 // We do have the minimum number of matches.
5161
5162 // If there is no upper bound on the loop iterations, check that the input index
5163 // is progressing, and stop the loop if it is not.
5164 if (maxCount == -1) {
5165 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5166 if (fp->fInputIdx == *pLastInputIdx) {
5167 break;
5168 }
5169 *pLastInputIdx = fp->fInputIdx;
5170 }
5171
5172 // Loop Continuation: we will fall into the pattern following the loop
5173 // (non-greedy, don't execute loop body first), but first do
5174 // a state save to the top of the loop, so that a match failure
5175 // in the following pattern will try another iteration of the loop.
5176 fp = StateSave(fp, opValue + 4, status);
5177 }
5178 }
5179 break;
5180
5181 case URX_STO_SP:
5182 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5183 fData[opValue] = fStack->size();
5184 break;
5185
5186 case URX_LD_SP:
5187 {
5188 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5189 int32_t newStackSize = (int32_t)fData[opValue];
5190 U_ASSERT(newStackSize <= fStack->size());
5191 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5192 if (newFP == (int64_t *)fp) {
5193 break;
5194 }
5195 int32_t i;
5196 for (i=0; i<fFrameSize; i++) {
5197 newFP[i] = ((int64_t *)fp)[i];
5198 }
5199 fp = (REStackFrame *)newFP;
5200 fStack->setSize(newStackSize);
5201 }
5202 break;
5203
5204 case URX_BACKREF:
5205 {
5206 U_ASSERT(opValue < fFrameSize);
5207 int64_t groupStartIdx = fp->fExtra[opValue];
5208 int64_t groupEndIdx = fp->fExtra[opValue+1];
5209 U_ASSERT(groupStartIdx <= groupEndIdx);
5210 int64_t inputIndex = fp->fInputIdx;
5211 if (groupStartIdx < 0) {
5212 // This capture group has not participated in the match thus far,
5213 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5214 break;
5215 }
5216 UBool success = TRUE;
5217 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5218 if (inputIndex >= fActiveLimit) {
5219 success = FALSE;
5220 fHitEnd = TRUE;
5221 break;
5222 }
5223 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5224 success = FALSE;
5225 break;
5226 }
5227 }
5228 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5229 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5230 // Capture group ended with an unpaired lead surrogate.
5231 // Back reference is not permitted to match lead only of a surrogatge pair.
5232 success = FALSE;
5233 }
5234 if (success) {
5235 fp->fInputIdx = inputIndex;
5236 } else {
5237 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5238 }
5239 }
5240 break;
5241
5242 case URX_BACKREF_I:
5243 {
5244 U_ASSERT(opValue < fFrameSize);
5245 int64_t groupStartIdx = fp->fExtra[opValue];
5246 int64_t groupEndIdx = fp->fExtra[opValue+1];
5247 U_ASSERT(groupStartIdx <= groupEndIdx);
5248 if (groupStartIdx < 0) {
5249 // This capture group has not participated in the match thus far,
5250 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5251 break;
5252 }
5253 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5254 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5255
5256 // Note: if the capture group match was of an empty string the backref
5257 // match succeeds. Verified by testing: Perl matches succeed
5258 // in this case, so we do too.
5259
5260 UBool success = TRUE;
5261 for (;;) {
5262 UChar32 captureGroupChar = captureGroupItr.next();
5263 if (captureGroupChar == U_SENTINEL) {
5264 success = TRUE;
5265 break;
5266 }
5267 UChar32 inputChar = inputItr.next();
5268 if (inputChar == U_SENTINEL) {
5269 success = FALSE;
5270 fHitEnd = TRUE;
5271 break;
5272 }
5273 if (inputChar != captureGroupChar) {
5274 success = FALSE;
5275 break;
5276 }
5277 }
5278
5279 if (success && inputItr.inExpansion()) {
5280 // We otained a match by consuming part of a string obtained from
5281 // case-folding a single code point of the input text.
5282 // This does not count as an overall match.
5283 success = FALSE;
5284 }
5285
5286 if (success) {
5287 fp->fInputIdx = inputItr.getIndex();
5288 } else {
5289 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5290 }
5291 }
5292 break;
5293
5294 case URX_STO_INP_LOC:
5295 {
5296 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5297 fp->fExtra[opValue] = fp->fInputIdx;
5298 }
5299 break;
5300
5301 case URX_JMPX:
5302 {
5303 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5304 fp->fPatIdx += 1;
5305 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5306 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5307 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5308 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5309 if (savedInputIdx < fp->fInputIdx) {
5310 fp->fPatIdx = opValue; // JMP
5311 } else {
5312 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5313 }
5314 }
5315 break;
5316
5317 case URX_LA_START:
5318 {
5319 // Entering a lookahead block.
5320 // Save Stack Ptr, Input Pos.
5321 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5322 fData[opValue] = fStack->size();
5323 fData[opValue+1] = fp->fInputIdx;
5324 fActiveStart = fLookStart; // Set the match region change for
5325 fActiveLimit = fLookLimit; // transparent bounds.
5326 }
5327 break;
5328
5329 case URX_LA_END:
5330 {
5331 // Leaving a look-ahead block.
5332 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5333 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5334 int32_t stackSize = fStack->size();
5335 int32_t newStackSize = (int32_t)fData[opValue];
5336 U_ASSERT(stackSize >= newStackSize);
5337 if (stackSize > newStackSize) {
5338 // Copy the current top frame back to the new (cut back) top frame.
5339 // This makes the capture groups from within the look-ahead
5340 // expression available.
5341 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5342 int32_t i;
5343 for (i=0; i<fFrameSize; i++) {
5344 newFP[i] = ((int64_t *)fp)[i];
5345 }
5346 fp = (REStackFrame *)newFP;
5347 fStack->setSize(newStackSize);
5348 }
5349 fp->fInputIdx = fData[opValue+1];
5350
5351 // Restore the active region bounds in the input string; they may have
5352 // been changed because of transparent bounds on a Region.
5353 fActiveStart = fRegionStart;
5354 fActiveLimit = fRegionLimit;
5355 }
5356 break;
5357
5358 case URX_ONECHAR_I:
5359 if (fp->fInputIdx < fActiveLimit) {
5360 UChar32 c;
5361 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5362 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5363 break;
5364 }
5365 } else {
5366 fHitEnd = TRUE;
5367 }
5368 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5369 break;
5370
5371 case URX_STRING_I:
5372 // Case-insensitive test input against a literal string.
5373 // Strings require two slots in the compiled pattern, one for the
5374 // offset to the string text, and one for the length.
5375 // The compiled string has already been case folded.
5376 {
5377 const UChar *patternString = litText + opValue;
5378
5379 op = (int32_t)pat[fp->fPatIdx];
5380 fp->fPatIdx++;
5381 opType = URX_TYPE(op);
5382 opValue = URX_VAL(op);
5383 U_ASSERT(opType == URX_STRING_LEN);
5384 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5385
5386 UChar32 cText;
5387 UChar32 cPattern;
5388 UBool success = TRUE;
5389 int32_t patternStringIdx = 0;
5390 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5391 while (patternStringIdx < patternStringLen) {
5392 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5393 cText = inputIterator.next();
5394 if (cText != cPattern) {
5395 success = FALSE;
5396 if (cText == U_SENTINEL) {
5397 fHitEnd = TRUE;
5398 }
5399 break;
5400 }
5401 }
5402 if (inputIterator.inExpansion()) {
5403 success = FALSE;
5404 }
5405
5406 if (success) {
5407 fp->fInputIdx = inputIterator.getIndex();
5408 } else {
5409 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5410 }
5411 }
5412 break;
5413
5414 case URX_LB_START:
5415 {
5416 // Entering a look-behind block.
5417 // Save Stack Ptr, Input Pos.
5418 // TODO: implement transparent bounds. Ticket #6067
5419 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5420 fData[opValue] = fStack->size();
5421 fData[opValue+1] = fp->fInputIdx;
5422 // Init the variable containing the start index for attempted matches.
5423 fData[opValue+2] = -1;
5424 // Save input string length, then reset to pin any matches to end at
5425 // the current position.
5426 fData[opValue+3] = fActiveLimit;
5427 fActiveLimit = fp->fInputIdx;
5428 }
5429 break;
5430
5431
5432 case URX_LB_CONT:
5433 {
5434 // Positive Look-Behind, at top of loop checking for matches of LB expression
5435 // at all possible input starting positions.
5436
5437 // Fetch the min and max possible match lengths. They are the operands
5438 // of this op in the pattern.
5439 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5440 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5441 U_ASSERT(minML <= maxML);
5442 U_ASSERT(minML >= 0);
5443
5444 // Fetch (from data) the last input index where a match was attempted.
5445 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5446 int64_t &lbStartIdx = fData[opValue+2];
5447 if (lbStartIdx < 0) {
5448 // First time through loop.
5449 lbStartIdx = fp->fInputIdx - minML;
5450 if (lbStartIdx > 0) {
5451 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5452 }
5453 } else {
5454 // 2nd through nth time through the loop.
5455 // Back up start position for match by one.
5456 if (lbStartIdx == 0) {
5457 lbStartIdx--;
5458 } else {
5459 U16_BACK_1(inputBuf, 0, lbStartIdx);
5460 }
5461 }
5462
5463 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5464 // We have tried all potential match starting points without
5465 // getting a match. Backtrack out, and out of the
5466 // Look Behind altogether.
5467 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5468 int64_t restoreInputLen = fData[opValue+3];
5469 U_ASSERT(restoreInputLen >= fActiveLimit);
5470 U_ASSERT(restoreInputLen <= fInputLength);
5471 fActiveLimit = restoreInputLen;
5472 break;
5473 }
5474
5475 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5476 // (successful match will fall off the end of the loop.)
5477 fp = StateSave(fp, fp->fPatIdx-3, status);
5478 fp->fInputIdx = lbStartIdx;
5479 }
5480 break;
5481
5482 case URX_LB_END:
5483 // End of a look-behind block, after a successful match.
5484 {
5485 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5486 if (fp->fInputIdx != fActiveLimit) {
5487 // The look-behind expression matched, but the match did not
5488 // extend all the way to the point that we are looking behind from.
5489 // FAIL out of here, which will take us back to the LB_CONT, which
5490 // will retry the match starting at another position or fail
5491 // the look-behind altogether, whichever is appropriate.
5492 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5493 break;
5494 }
5495
5496 // Look-behind match is good. Restore the orignal input string length,
5497 // which had been truncated to pin the end of the lookbehind match to the
5498 // position being looked-behind.
5499 int64_t originalInputLen = fData[opValue+3];
5500 U_ASSERT(originalInputLen >= fActiveLimit);
5501 U_ASSERT(originalInputLen <= fInputLength);
5502 fActiveLimit = originalInputLen;
5503 }
5504 break;
5505
5506
5507 case URX_LBN_CONT:
5508 {
5509 // Negative Look-Behind, at top of loop checking for matches of LB expression
5510 // at all possible input starting positions.
5511
5512 // Fetch the extra parameters of this op.
5513 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5514 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5515 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5516 continueLoc = URX_VAL(continueLoc);
5517 U_ASSERT(minML <= maxML);
5518 U_ASSERT(minML >= 0);
5519 U_ASSERT(continueLoc > fp->fPatIdx);
5520
5521 // Fetch (from data) the last input index where a match was attempted.
5522 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5523 int64_t &lbStartIdx = fData[opValue+2];
5524 if (lbStartIdx < 0) {
5525 // First time through loop.
5526 lbStartIdx = fp->fInputIdx - minML;
5527 if (lbStartIdx > 0) {
5528 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5529 }
5530 } else {
5531 // 2nd through nth time through the loop.
5532 // Back up start position for match by one.
5533 if (lbStartIdx == 0) {
5534 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
5535 } else {
5536 U16_BACK_1(inputBuf, 0, lbStartIdx);
5537 }
5538 }
5539
5540 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5541 // We have tried all potential match starting points without
5542 // getting a match, which means that the negative lookbehind as
5543 // a whole has succeeded. Jump forward to the continue location
5544 int64_t restoreInputLen = fData[opValue+3];
5545 U_ASSERT(restoreInputLen >= fActiveLimit);
5546 U_ASSERT(restoreInputLen <= fInputLength);
5547 fActiveLimit = restoreInputLen;
5548 fp->fPatIdx = continueLoc;
5549 break;
5550 }
5551
5552 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5553 // (successful match will cause a FAIL out of the loop altogether.)
5554 fp = StateSave(fp, fp->fPatIdx-4, status);
5555 fp->fInputIdx = lbStartIdx;
5556 }
5557 break;
5558
5559 case URX_LBN_END:
5560 // End of a negative look-behind block, after a successful match.
5561 {
5562 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5563 if (fp->fInputIdx != fActiveLimit) {
5564 // The look-behind expression matched, but the match did not
5565 // extend all the way to the point that we are looking behind from.
5566 // FAIL out of here, which will take us back to the LB_CONT, which
5567 // will retry the match starting at another position or succeed
5568 // the look-behind altogether, whichever is appropriate.
5569 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5570 break;
5571 }
5572
5573 // Look-behind expression matched, which means look-behind test as
5574 // a whole Fails
5575
5576 // Restore the orignal input string length, which had been truncated
5577 // inorder to pin the end of the lookbehind match
5578 // to the position being looked-behind.
5579 int64_t originalInputLen = fData[opValue+3];
5580 U_ASSERT(originalInputLen >= fActiveLimit);
5581 U_ASSERT(originalInputLen <= fInputLength);
5582 fActiveLimit = originalInputLen;
5583
5584 // Restore original stack position, discarding any state saved
5585 // by the successful pattern match.
5586 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5587 int32_t newStackSize = (int32_t)fData[opValue];
5588 U_ASSERT(fStack->size() > newStackSize);
5589 fStack->setSize(newStackSize);
5590
5591 // FAIL, which will take control back to someplace
5592 // prior to entering the look-behind test.
5593 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5594 }
5595 break;
5596
5597
5598 case URX_LOOP_SR_I:
5599 // Loop Initialization for the optimized implementation of
5600 // [some character set]*
5601 // This op scans through all matching input.
5602 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5603 {
5604 U_ASSERT(opValue > 0 && opValue < sets->size());
5605 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5606 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5607
5608 // Loop through input, until either the input is exhausted or
5609 // we reach a character that is not a member of the set.
5610 int32_t ix = (int32_t)fp->fInputIdx;
5611 for (;;) {
5612 if (ix >= fActiveLimit) {
5613 fHitEnd = TRUE;
5614 break;
5615 }
5616 UChar32 c;
5617 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5618 if (c<256) {
5619 if (s8->contains(c) == FALSE) {
5620 U16_BACK_1(inputBuf, 0, ix);
5621 break;
5622 }
5623 } else {
5624 if (s->contains(c) == FALSE) {
5625 U16_BACK_1(inputBuf, 0, ix);
5626 break;
5627 }
5628 }
5629 }
5630
5631 // If there were no matching characters, skip over the loop altogether.
5632 // The loop doesn't run at all, a * op always succeeds.
5633 if (ix == fp->fInputIdx) {
5634 fp->fPatIdx++; // skip the URX_LOOP_C op.
5635 break;
5636 }
5637
5638 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5639 // must follow. It's operand is the stack location
5640 // that holds the starting input index for the match of this [set]*
5641 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5642 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5643 int32_t stackLoc = URX_VAL(loopcOp);
5644 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5645 fp->fExtra[stackLoc] = fp->fInputIdx;
5646 fp->fInputIdx = ix;
5647
5648 // Save State to the URX_LOOP_C op that follows this one,
5649 // so that match failures in the following code will return to there.
5650 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5651 fp = StateSave(fp, fp->fPatIdx, status);
5652 fp->fPatIdx++;
5653 }
5654 break;
5655
5656
5657 case URX_LOOP_DOT_I:
5658 // Loop Initialization for the optimized implementation of .*
5659 // This op scans through all remaining input.
5660 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5661 {
5662 // Loop through input until the input is exhausted (we reach an end-of-line)
5663 // In DOTALL mode, we can just go straight to the end of the input.
5664 int32_t ix;
5665 if ((opValue & 1) == 1) {
5666 // Dot-matches-All mode. Jump straight to the end of the string.
5667 ix = (int32_t)fActiveLimit;
5668 fHitEnd = TRUE;
5669 } else {
5670 // NOT DOT ALL mode. Line endings do not match '.'
5671 // Scan forward until a line ending or end of input.
5672 ix = (int32_t)fp->fInputIdx;
5673 for (;;) {
5674 if (ix >= fActiveLimit) {
5675 fHitEnd = TRUE;
5676 break;
5677 }
5678 UChar32 c;
5679 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5680 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5681 if ((c == 0x0a) || // 0x0a is newline in both modes.
5682 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5683 isLineTerminator(c))) {
5684 // char is a line ending. Put the input pos back to the
5685 // line ending char, and exit the scanning loop.
5686 U16_BACK_1(inputBuf, 0, ix);
5687 break;
5688 }
5689 }
5690 }
5691 }
5692
5693 // If there were no matching characters, skip over the loop altogether.
5694 // The loop doesn't run at all, a * op always succeeds.
5695 if (ix == fp->fInputIdx) {
5696 fp->fPatIdx++; // skip the URX_LOOP_C op.
5697 break;
5698 }
5699
5700 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5701 // must follow. It's operand is the stack location
5702 // that holds the starting input index for the match of this .*
5703 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5704 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5705 int32_t stackLoc = URX_VAL(loopcOp);
5706 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5707 fp->fExtra[stackLoc] = fp->fInputIdx;
5708 fp->fInputIdx = ix;
5709
5710 // Save State to the URX_LOOP_C op that follows this one,
5711 // so that match failures in the following code will return to there.
5712 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5713 fp = StateSave(fp, fp->fPatIdx, status);
5714 fp->fPatIdx++;
5715 }
5716 break;
5717
5718
5719 case URX_LOOP_C:
5720 {
5721 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5722 backSearchIndex = (int32_t)fp->fExtra[opValue];
5723 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5724 if (backSearchIndex == fp->fInputIdx) {
5725 // We've backed up the input idx to the point that the loop started.
5726 // The loop is done. Leave here without saving state.
5727 // Subsequent failures won't come back here.
5728 break;
5729 }
5730 // Set up for the next iteration of the loop, with input index
5731 // backed up by one from the last time through,
5732 // and a state save to this instruction in case the following code fails again.
5733 // (We're going backwards because this loop emulates stack unwinding, not
5734 // the initial scan forward.)
5735 U_ASSERT(fp->fInputIdx > 0);
5736 UChar32 prevC;
5737 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5738
5739 if (prevC == 0x0a &&
5740 fp->fInputIdx > backSearchIndex &&
5741 inputBuf[fp->fInputIdx-1] == 0x0d) {
5742 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5743 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5744 // .*, stepping back over CRLF pair.
5745 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5746 }
5747 }
5748
5749
5750 fp = StateSave(fp, fp->fPatIdx-1, status);
5751 }
5752 break;
5753
5754
5755
5756 default:
5757 // Trouble. The compiled pattern contains an entry with an
5758 // unrecognized type tag.
5759 U_ASSERT(FALSE);
5760 }
5761
5762 if (U_FAILURE(status)) {
5763 isMatch = FALSE;
5764 break;
5765 }
5766 }
5767
5768 breakFromLoop:
5769 fMatch = isMatch;
5770 if (isMatch) {
5771 fLastMatchEnd = fMatchEnd;
5772 fMatchStart = startIdx;
5773 fMatchEnd = fp->fInputIdx;
5774 }
5775
5776 #ifdef REGEX_RUN_DEBUG
5777 if (fTraceDebug) {
5778 if (isMatch) {
5779 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5780 } else {
5781 printf("No match\n\n");
5782 }
5783 }
5784 #endif
5785
5786 fFrame = fp; // The active stack frame when the engine stopped.
5787 // Contains the capture group results that we need to
5788 // access later.
5789
5790 return;
5791 }
5792
5793
5794 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5795
5796 U_NAMESPACE_END
5797
5798 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5799