1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 **************************************************************************
8 */
9 //
10 // file: rematch.cpp
11 //
12 // Contains the implementation of class RegexMatcher,
13 // which is one of the main API classes for the ICU regular expression package.
14 //
15
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "uassert.h"
27 #include "cmemory.h"
28 #include "cstr.h"
29 #include "uvector.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "regeximp.h"
33 #include "regexst.h"
34 #include "regextxt.h"
35 #include "ucase.h"
36
37 // #include <malloc.h> // Needed for heapcheck testing
38
39
40 U_NAMESPACE_BEGIN
41
42 // Default limit for the size of the back track stack, to avoid system
43 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 // which use recursion rather than the heap, and take more storage per
46 // backtrack point.
47 //
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50 // Time limit counter constant.
51 // Time limits for expression evaluation are in terms of quanta of work by
52 // the engine, each of which is 10,000 state saves.
53 // This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)58 static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60 return false;
61 }
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63 }
64
65 //-----------------------------------------------------------------------------
66 //
67 // Constructor and Destructor
68 //
69 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)70 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
74 return;
75 }
76 if (pat==nullptr) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78 return;
79 }
80 fPattern = pat;
81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82 }
83
84
85
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)86 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
88 init(status);
89 if (U_FAILURE(status)) {
90 return;
91 }
92 UParseError pe;
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
94 fPattern = fPatternOwned;
95
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
100
101 fInputUniStrMaybeMutable = true;
102 }
103
104
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
107 init(status);
108 if (U_FAILURE(status)) {
109 return;
110 }
111 UParseError pe;
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
114 return;
115 }
116
117 fPattern = fPatternOwned;
118 init2(input, status);
119 }
120
121
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)122 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
123 uint32_t flags, UErrorCode &status) {
124 init(status);
125 if (U_FAILURE(status)) {
126 return;
127 }
128 UParseError pe;
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
130 if (U_FAILURE(status)) {
131 return;
132 }
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135 }
136
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)137 RegexMatcher::RegexMatcher(UText *regexp,
138 uint32_t flags, UErrorCode &status) {
139 init(status);
140 if (U_FAILURE(status)) {
141 return;
142 }
143 UParseError pe;
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
146 return;
147 }
148
149 fPattern = fPatternOwned;
150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151 }
152
153
154
155
~RegexMatcher()156 RegexMatcher::~RegexMatcher() {
157 delete fStack;
158 if (fData != fSmallData) {
159 uprv_free(fData);
160 fData = nullptr;
161 }
162 if (fPatternOwned) {
163 delete fPatternOwned;
164 fPatternOwned = nullptr;
165 fPattern = nullptr;
166 }
167
168 delete fInput;
169 if (fInputText) {
170 utext_close(fInputText);
171 }
172 if (fAltInputText) {
173 utext_close(fAltInputText);
174 }
175
176 #if UCONFIG_NO_BREAK_ITERATION==0
177 delete fWordBreakItr;
178 delete fGCBreakItr;
179 #endif
180 }
181
182 //
183 // init() common initialization for use by all constructors.
184 // Initialize all fields, get the object into a consistent state.
185 // This must be done even when the initial status shows an error,
186 // so that the object is initialized sufficiently well for the destructor
187 // to run safely.
188 //
init(UErrorCode & status)189 void RegexMatcher::init(UErrorCode &status) {
190 fPattern = nullptr;
191 fPatternOwned = nullptr;
192 fFrameSize = 0;
193 fRegionStart = 0;
194 fRegionLimit = 0;
195 fAnchorStart = 0;
196 fAnchorLimit = 0;
197 fLookStart = 0;
198 fLookLimit = 0;
199 fActiveStart = 0;
200 fActiveLimit = 0;
201 fTransparentBounds = false;
202 fAnchoringBounds = true;
203 fMatch = false;
204 fMatchStart = 0;
205 fMatchEnd = 0;
206 fLastMatchEnd = -1;
207 fAppendPosition = 0;
208 fHitEnd = false;
209 fRequireEnd = false;
210 fStack = nullptr;
211 fFrame = nullptr;
212 fTimeLimit = 0;
213 fTime = 0;
214 fTickCounter = 0;
215 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
216 fCallbackFn = nullptr;
217 fCallbackContext = nullptr;
218 fFindProgressCallbackFn = nullptr;
219 fFindProgressCallbackContext = nullptr;
220 fTraceDebug = false;
221 fDeferredStatus = status;
222 fData = fSmallData;
223 fWordBreakItr = nullptr;
224 fGCBreakItr = nullptr;
225
226 fStack = nullptr;
227 fInputText = nullptr;
228 fAltInputText = nullptr;
229 fInput = nullptr;
230 fInputLength = 0;
231 fInputUniStrMaybeMutable = false;
232 }
233
234 //
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
237 //
init2(UText * input,UErrorCode & status)238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
239 if (U_FAILURE(status)) {
240 fDeferredStatus = status;
241 return;
242 }
243
244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246 if (fData == nullptr) {
247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248 return;
249 }
250 }
251
252 fStack = new UVector64(status);
253 if (fStack == nullptr) {
254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255 return;
256 }
257
258 reset(input);
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260 if (U_FAILURE(status)) {
261 fDeferredStatus = status;
262 return;
263 }
264 }
265
266
267 static const char16_t BACKSLASH = 0x5c;
268 static const char16_t DOLLARSIGN = 0x24;
269 static const char16_t LEFTBRACKET = 0x7b;
270 static const char16_t RIGHTBRACKET = 0x7d;
271
272 //--------------------------------------------------------------------------------
273 //
274 // appendReplacement
275 //
276 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278 const UnicodeString &replacement,
279 UErrorCode &status) {
280 UText replacementText = UTEXT_INITIALIZER;
281
282 utext_openConstUnicodeString(&replacementText, &replacement, &status);
283 if (U_SUCCESS(status)) {
284 UText resultText = UTEXT_INITIALIZER;
285 utext_openUnicodeString(&resultText, &dest, &status);
286
287 if (U_SUCCESS(status)) {
288 appendReplacement(&resultText, &replacementText, status);
289 utext_close(&resultText);
290 }
291 utext_close(&replacementText);
292 }
293
294 return *this;
295 }
296
297 //
298 // appendReplacement, UText mode
299 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301 UText *replacement,
302 UErrorCode &status) {
303 if (U_FAILURE(status)) {
304 return *this;
305 }
306 if (U_FAILURE(fDeferredStatus)) {
307 status = fDeferredStatus;
308 return *this;
309 }
310 if (fMatch == false) {
311 status = U_REGEX_INVALID_STATE;
312 return *this;
313 }
314
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen = utext_nativeLength(dest);
317 if (fMatchStart > fAppendPosition) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320 (int32_t)(fMatchStart-fAppendPosition), &status);
321 } else {
322 int32_t len16;
323 if (UTEXT_USES_U16(fInputText)) {
324 len16 = (int32_t)(fMatchStart-fAppendPosition);
325 } else {
326 UErrorCode lengthStatus = U_ZERO_ERROR;
327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, nullptr, 0, &lengthStatus);
328 }
329 char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(len16+1));
330 if (inputChars == nullptr) {
331 status = U_MEMORY_ALLOCATION_ERROR;
332 return *this;
333 }
334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336 uprv_free(inputChars);
337 }
338 }
339 fAppendPosition = fMatchEnd;
340
341
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement, 0);
346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
347 if (c == BACKSLASH) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
352 // loop iteration.
353 c = UTEXT_CURRENT32(replacement);
354 if (c == U_SENTINEL) {
355 break;
356 }
357
358 if (c==0x55/*U*/ || c==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
360 int32_t offset = 0;
361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363 if (escapedChar != (UChar32)0xFFFFFFFF) {
364 if (U_IS_BMP(escapedChar)) {
365 char16_t c16 = (char16_t)escapedChar;
366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367 } else {
368 char16_t surrogate[2];
369 surrogate[0] = U16_LEAD(escapedChar);
370 surrogate[1] = U16_TRAIL(escapedChar);
371 if (U_SUCCESS(status)) {
372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373 }
374 }
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
377 if (context.lastOffset == offset) {
378 (void)UTEXT_PREVIOUS32(replacement);
379 } else if (context.lastOffset != offset-1) {
380 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381 }
382 }
383 } else {
384 (void)UTEXT_NEXT32(replacement);
385 // Plain backslash escape. Just put out the escaped character.
386 if (U_IS_BMP(c)) {
387 char16_t c16 = (char16_t)c;
388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389 } else {
390 char16_t surrogate[2];
391 surrogate[0] = U16_LEAD(c);
392 surrogate[1] = U16_TRAIL(c);
393 if (U_SUCCESS(status)) {
394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395 }
396 }
397 }
398 } else if (c != DOLLARSIGN) {
399 // Normal char, not a $. Copy it out without further checks.
400 if (U_IS_BMP(c)) {
401 char16_t c16 = (char16_t)c;
402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403 } else {
404 char16_t surrogate[2];
405 surrogate[0] = U16_LEAD(c);
406 surrogate[1] = U16_TRAIL(c);
407 if (U_SUCCESS(status)) {
408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409 }
410 }
411 } else {
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
415
416 int32_t groupNum = 0;
417 int32_t numDigits = 0;
418 UChar32 nextChar = utext_current32(replacement);
419 if (nextChar == LEFTBRACKET) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName;
422 utext_next32(replacement);
423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424 nextChar = utext_next32(replacement);
425 if (nextChar == U_SENTINEL) {
426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
430 groupName.append(nextChar);
431 } else if (nextChar == RIGHTBRACKET) {
432 groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
433 if (groupNum == 0) {
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
436 } else {
437 // Character was something other than a name char or a closing '}'
438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439 }
440 }
441
442 } else if (u_isdigit(nextChar)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445 for (;;) {
446 nextChar = UTEXT_CURRENT32(replacement);
447 if (nextChar == U_SENTINEL) {
448 break;
449 }
450 if (u_isdigit(nextChar) == false) {
451 break;
452 }
453 int32_t nextDigitVal = u_charDigitValue(nextChar);
454 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits == 0) {
457 status = U_INDEX_OUTOFBOUNDS_ERROR;
458 }
459 break;
460 }
461 (void)UTEXT_NEXT32(replacement);
462 groupNum=groupNum*10 + nextDigitVal;
463 ++numDigits;
464 }
465 } else {
466 // $ not followed by capture group name or number.
467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468 }
469
470 if (U_SUCCESS(status)) {
471 destLen += appendGroup(groupNum, dest, status);
472 }
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
475
476 return *this;
477 }
478
479
480
481 //--------------------------------------------------------------------------------
482 //
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
486 //
487 // Note: Match ranges do not affect appendTail or appendReplacement
488 //
489 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491 UErrorCode status = U_ZERO_ERROR;
492 UText resultText = UTEXT_INITIALIZER;
493 utext_openUnicodeString(&resultText, &dest, &status);
494
495 if (U_SUCCESS(status)) {
496 appendTail(&resultText, status);
497 utext_close(&resultText);
498 }
499
500 return dest;
501 }
502
503 //
504 // appendTail, UText mode
505 //
appendTail(UText * dest,UErrorCode & status)506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507 if (U_FAILURE(status)) {
508 return dest;
509 }
510 if (U_FAILURE(fDeferredStatus)) {
511 status = fDeferredStatus;
512 return dest;
513 }
514
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, nullptr, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
528
529 char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(len16));
530 if (inputChars == nullptr) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
539 }
540 return dest;
541 }
542
543
544
545 //--------------------------------------------------------------------------------
546 //
547 // end
548 //
549 //--------------------------------------------------------------------------------
end(UErrorCode & err) const550 int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552 }
553
end64(UErrorCode & err) const554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556 }
557
end64(int32_t group,UErrorCode & err) const558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == false) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
570 int64_t e = -1;
571 if (group == 0) {
572 e = fMatchEnd;
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
581
582 return e;
583 }
584
end(int32_t group,UErrorCode & err) const585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587 }
588
589 //--------------------------------------------------------------------------------
590 //
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
594 //
595 // Return: true if the find operation is to be terminated.
596 // false if the find operation is to continue running.
597 //
598 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601 status = U_REGEX_STOPPED_BY_CALLER;
602 return true;
603 }
604 return false;
605 }
606
607 //--------------------------------------------------------------------------------
608 //
609 // find()
610 //
611 //--------------------------------------------------------------------------------
find()612 UBool RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus)) {
614 return false;
615 }
616 UErrorCode status = U_ZERO_ERROR;
617 UBool result = find(status);
618 return result;
619 }
620
621 //--------------------------------------------------------------------------------
622 //
623 // find()
624 //
625 //--------------------------------------------------------------------------------
find(UErrorCode & status)626 UBool RegexMatcher::find(UErrorCode &status) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
629 //
630 if (U_FAILURE(status)) {
631 return false;
632 }
633 if (U_FAILURE(fDeferredStatus)) {
634 status = fDeferredStatus;
635 return false;
636 }
637
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639 return findUsingChunk(status);
640 }
641
642 int64_t startPos = fMatchEnd;
643 if (startPos==0) {
644 startPos = fActiveStart;
645 }
646
647 if (fMatch) {
648 // Save the position of any previous successful match.
649 fLastMatchEnd = fMatchEnd;
650
651 if (fMatchStart == fMatchEnd) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos >= fActiveLimit) {
655 fMatch = false;
656 fHitEnd = true;
657 return false;
658 }
659 UTEXT_SETNATIVEINDEX(fInputText, startPos);
660 (void)UTEXT_NEXT32(fInputText);
661 startPos = UTEXT_GETNATIVEINDEX(fInputText);
662 }
663 } else {
664 if (fLastMatchEnd >= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
668 fHitEnd = true;
669 return false;
670 }
671 }
672
673
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit;
679 if (UTEXT_USES_U16(fInputText)) {
680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681 if (startPos > testStartLimit) {
682 fMatch = false;
683 fHitEnd = true;
684 return false;
685 }
686 } else {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690 }
691
692 UChar32 c;
693 U_ASSERT(startPos >= 0);
694
695 switch (fPattern->fStartType) {
696 case START_NO_INFO:
697 // No optimization was found.
698 // Try a match at each input position.
699 for (;;) {
700 MatchAt(startPos, false, status);
701 if (U_FAILURE(status)) {
702 return false;
703 }
704 if (fMatch) {
705 return true;
706 }
707 if (startPos >= testStartLimit) {
708 fHitEnd = true;
709 return false;
710 }
711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
712 (void)UTEXT_NEXT32(fInputText);
713 startPos = UTEXT_GETNATIVEINDEX(fInputText);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos, status))
718 return false;
719 }
720 UPRV_UNREACHABLE_EXIT;
721
722 case START_START:
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos > fActiveStart) {
726 fMatch = false;
727 return false;
728 }
729 MatchAt(startPos, false, status);
730 if (U_FAILURE(status)) {
731 return false;
732 }
733 return fMatch;
734
735
736 case START_SET:
737 {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern->fMinMatchLen > 0);
740 UTEXT_SETNATIVEINDEX(fInputText, startPos);
741 for (;;) {
742 int64_t pos = startPos;
743 c = UTEXT_NEXT32(fInputText);
744 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749 (c>=256 && fPattern->fInitialChars->contains(c)))) {
750 MatchAt(pos, false, status);
751 if (U_FAILURE(status)) {
752 return false;
753 }
754 if (fMatch) {
755 return true;
756 }
757 UTEXT_SETNATIVEINDEX(fInputText, pos);
758 }
759 if (startPos > testStartLimit) {
760 fMatch = false;
761 fHitEnd = true;
762 return false;
763 }
764 if (findProgressInterrupt(startPos, status))
765 return false;
766 }
767 }
768 UPRV_UNREACHABLE_EXIT;
769
770 case START_STRING:
771 case START_CHAR:
772 {
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern->fMinMatchLen > 0);
775 UChar32 theChar = fPattern->fInitialChar;
776 UTEXT_SETNATIVEINDEX(fInputText, startPos);
777 for (;;) {
778 int64_t pos = startPos;
779 c = UTEXT_NEXT32(fInputText);
780 startPos = UTEXT_GETNATIVEINDEX(fInputText);
781 if (c == theChar) {
782 MatchAt(pos, false, status);
783 if (U_FAILURE(status)) {
784 return false;
785 }
786 if (fMatch) {
787 return true;
788 }
789 UTEXT_SETNATIVEINDEX(fInputText, startPos);
790 }
791 if (startPos > testStartLimit) {
792 fMatch = false;
793 fHitEnd = true;
794 return false;
795 }
796 if (findProgressInterrupt(startPos, status))
797 return false;
798 }
799 }
800 UPRV_UNREACHABLE_EXIT;
801
802 case START_LINE:
803 {
804 UChar32 ch;
805 if (startPos == fAnchorStart) {
806 MatchAt(startPos, false, status);
807 if (U_FAILURE(status)) {
808 return false;
809 }
810 if (fMatch) {
811 return true;
812 }
813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814 ch = UTEXT_NEXT32(fInputText);
815 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816 } else {
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818 ch = UTEXT_PREVIOUS32(fInputText);
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820 }
821
822 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823 for (;;) {
824 if (ch == 0x0a) {
825 MatchAt(startPos, false, status);
826 if (U_FAILURE(status)) {
827 return false;
828 }
829 if (fMatch) {
830 return true;
831 }
832 UTEXT_SETNATIVEINDEX(fInputText, startPos);
833 }
834 if (startPos >= testStartLimit) {
835 fMatch = false;
836 fHitEnd = true;
837 return false;
838 }
839 ch = UTEXT_NEXT32(fInputText);
840 startPos = UTEXT_GETNATIVEINDEX(fInputText);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos, status))
845 return false;
846 }
847 } else {
848 for (;;) {
849 if (isLineTerminator(ch)) {
850 if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText);
852 startPos = UTEXT_GETNATIVEINDEX(fInputText);
853 }
854 MatchAt(startPos, false, status);
855 if (U_FAILURE(status)) {
856 return false;
857 }
858 if (fMatch) {
859 return true;
860 }
861 UTEXT_SETNATIVEINDEX(fInputText, startPos);
862 }
863 if (startPos >= testStartLimit) {
864 fMatch = false;
865 fHitEnd = true;
866 return false;
867 }
868 ch = UTEXT_NEXT32(fInputText);
869 startPos = UTEXT_GETNATIVEINDEX(fInputText);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos, status))
874 return false;
875 }
876 }
877 }
878
879 default:
880 UPRV_UNREACHABLE_ASSERT;
881 // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
882 // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
883 // See ICU-21669.
884 status = U_INTERNAL_PROGRAM_ERROR;
885 return false;
886 }
887
888 UPRV_UNREACHABLE_EXIT;
889 }
890
891
892
find(int64_t start,UErrorCode & status)893 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
894 if (U_FAILURE(status)) {
895 return false;
896 }
897 if (U_FAILURE(fDeferredStatus)) {
898 status = fDeferredStatus;
899 return false;
900 }
901 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
902 // This will reset the region to be the full input length.
903 if (start < 0) {
904 status = U_INDEX_OUTOFBOUNDS_ERROR;
905 return false;
906 }
907
908 int64_t nativeStart = start;
909 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
910 status = U_INDEX_OUTOFBOUNDS_ERROR;
911 return false;
912 }
913 fMatchEnd = nativeStart;
914 return find(status);
915 }
916
917
918 //--------------------------------------------------------------------------------
919 //
920 // findUsingChunk() -- like find(), but with the advance knowledge that the
921 // entire string is available in the UText's chunk buffer.
922 //
923 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)924 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
925 // Start at the position of the last match end. (Will be zero if the
926 // matcher has been reset.
927 //
928
929 int32_t startPos = (int32_t)fMatchEnd;
930 if (startPos==0) {
931 startPos = (int32_t)fActiveStart;
932 }
933
934 const char16_t *inputBuf = fInputText->chunkContents;
935
936 if (fMatch) {
937 // Save the position of any previous successful match.
938 fLastMatchEnd = fMatchEnd;
939
940 if (fMatchStart == fMatchEnd) {
941 // Previous match had zero length. Move start position up one position
942 // to avoid sending find() into a loop on zero-length matches.
943 if (startPos >= fActiveLimit) {
944 fMatch = false;
945 fHitEnd = true;
946 return false;
947 }
948 U16_FWD_1(inputBuf, startPos, fInputLength);
949 }
950 } else {
951 if (fLastMatchEnd >= 0) {
952 // A previous find() failed to match. Don't try again.
953 // (without this test, a pattern with a zero-length match
954 // could match again at the end of an input string.)
955 fHitEnd = true;
956 return false;
957 }
958 }
959
960
961 // Compute the position in the input string beyond which a match can not begin, because
962 // the minimum length match would extend past the end of the input.
963 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
964 // Be aware of possible overflows if making changes here.
965 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
966 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
967 if (startPos > testLen) {
968 fMatch = false;
969 fHitEnd = true;
970 return false;
971 }
972
973 UChar32 c;
974 U_ASSERT(startPos >= 0);
975
976 switch (fPattern->fStartType) {
977 case START_NO_INFO:
978 // No optimization was found.
979 // Try a match at each input position.
980 for (;;) {
981 MatchChunkAt(startPos, false, status);
982 if (U_FAILURE(status)) {
983 return false;
984 }
985 if (fMatch) {
986 return true;
987 }
988 if (startPos >= testLen) {
989 fHitEnd = true;
990 return false;
991 }
992 U16_FWD_1(inputBuf, startPos, fActiveLimit);
993 // Note that it's perfectly OK for a pattern to have a zero-length
994 // match at the end of a string, so we must make sure that the loop
995 // runs with startPos == testLen the last time through.
996 if (findProgressInterrupt(startPos, status))
997 return false;
998 }
999 UPRV_UNREACHABLE_EXIT;
1000
1001 case START_START:
1002 // Matches are only possible at the start of the input string
1003 // (pattern begins with ^ or \A)
1004 if (startPos > fActiveStart) {
1005 fMatch = false;
1006 return false;
1007 }
1008 MatchChunkAt(startPos, false, status);
1009 if (U_FAILURE(status)) {
1010 return false;
1011 }
1012 return fMatch;
1013
1014
1015 case START_SET:
1016 {
1017 // Match may start on any char from a pre-computed set.
1018 U_ASSERT(fPattern->fMinMatchLen > 0);
1019 for (;;) {
1020 int32_t pos = startPos;
1021 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1022 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1023 (c>=256 && fPattern->fInitialChars->contains(c))) {
1024 MatchChunkAt(pos, false, status);
1025 if (U_FAILURE(status)) {
1026 return false;
1027 }
1028 if (fMatch) {
1029 return true;
1030 }
1031 }
1032 if (startPos > testLen) {
1033 fMatch = false;
1034 fHitEnd = true;
1035 return false;
1036 }
1037 if (findProgressInterrupt(startPos, status))
1038 return false;
1039 }
1040 }
1041 UPRV_UNREACHABLE_EXIT;
1042
1043 case START_STRING:
1044 case START_CHAR:
1045 {
1046 // Match starts on exactly one char.
1047 U_ASSERT(fPattern->fMinMatchLen > 0);
1048 UChar32 theChar = fPattern->fInitialChar;
1049 for (;;) {
1050 int32_t pos = startPos;
1051 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1052 if (c == theChar) {
1053 MatchChunkAt(pos, false, status);
1054 if (U_FAILURE(status)) {
1055 return false;
1056 }
1057 if (fMatch) {
1058 return true;
1059 }
1060 }
1061 if (startPos > testLen) {
1062 fMatch = false;
1063 fHitEnd = true;
1064 return false;
1065 }
1066 if (findProgressInterrupt(startPos, status))
1067 return false;
1068 }
1069 }
1070 UPRV_UNREACHABLE_EXIT;
1071
1072 case START_LINE:
1073 {
1074 UChar32 ch;
1075 if (startPos == fAnchorStart) {
1076 MatchChunkAt(startPos, false, status);
1077 if (U_FAILURE(status)) {
1078 return false;
1079 }
1080 if (fMatch) {
1081 return true;
1082 }
1083 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1084 }
1085
1086 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1087 for (;;) {
1088 ch = inputBuf[startPos-1];
1089 if (ch == 0x0a) {
1090 MatchChunkAt(startPos, false, status);
1091 if (U_FAILURE(status)) {
1092 return false;
1093 }
1094 if (fMatch) {
1095 return true;
1096 }
1097 }
1098 if (startPos >= testLen) {
1099 fMatch = false;
1100 fHitEnd = true;
1101 return false;
1102 }
1103 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1104 // Note that it's perfectly OK for a pattern to have a zero-length
1105 // match at the end of a string, so we must make sure that the loop
1106 // runs with startPos == testLen the last time through.
1107 if (findProgressInterrupt(startPos, status))
1108 return false;
1109 }
1110 } else {
1111 for (;;) {
1112 ch = inputBuf[startPos-1];
1113 if (isLineTerminator(ch)) {
1114 if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1115 startPos++;
1116 }
1117 MatchChunkAt(startPos, false, status);
1118 if (U_FAILURE(status)) {
1119 return false;
1120 }
1121 if (fMatch) {
1122 return true;
1123 }
1124 }
1125 if (startPos >= testLen) {
1126 fMatch = false;
1127 fHitEnd = true;
1128 return false;
1129 }
1130 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1131 // Note that it's perfectly OK for a pattern to have a zero-length
1132 // match at the end of a string, so we must make sure that the loop
1133 // runs with startPos == testLen the last time through.
1134 if (findProgressInterrupt(startPos, status))
1135 return false;
1136 }
1137 }
1138 }
1139
1140 default:
1141 UPRV_UNREACHABLE_ASSERT;
1142 // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
1143 // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
1144 // See ICU-21669.
1145 status = U_INTERNAL_PROGRAM_ERROR;
1146 return false;
1147 }
1148
1149 UPRV_UNREACHABLE_EXIT;
1150 }
1151
1152
1153
1154 //--------------------------------------------------------------------------------
1155 //
1156 // group()
1157 //
1158 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1159 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1160 return group(0, status);
1161 }
1162
1163 // Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1164 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1165 return group(0, dest, group_len, status);
1166 }
1167
1168 // Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1169 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1170 group_len = 0;
1171 if (U_FAILURE(status)) {
1172 return dest;
1173 }
1174 if (U_FAILURE(fDeferredStatus)) {
1175 status = fDeferredStatus;
1176 } else if (fMatch == false) {
1177 status = U_REGEX_INVALID_STATE;
1178 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1179 status = U_INDEX_OUTOFBOUNDS_ERROR;
1180 }
1181
1182 if (U_FAILURE(status)) {
1183 return dest;
1184 }
1185
1186 int64_t s, e;
1187 if (groupNum == 0) {
1188 s = fMatchStart;
1189 e = fMatchEnd;
1190 } else {
1191 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1192 U_ASSERT(groupOffset < fPattern->fFrameSize);
1193 U_ASSERT(groupOffset >= 0);
1194 s = fFrame->fExtra[groupOffset];
1195 e = fFrame->fExtra[groupOffset+1];
1196 }
1197
1198 if (s < 0) {
1199 // A capture group wasn't part of the match
1200 return utext_clone(dest, fInputText, false, true, &status);
1201 }
1202 U_ASSERT(s <= e);
1203 group_len = e - s;
1204
1205 dest = utext_clone(dest, fInputText, false, true, &status);
1206 if (dest)
1207 UTEXT_SETNATIVEINDEX(dest, s);
1208 return dest;
1209 }
1210
group(int32_t groupNum,UErrorCode & status) const1211 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1212 UnicodeString result;
1213 int64_t groupStart = start64(groupNum, status);
1214 int64_t groupEnd = end64(groupNum, status);
1215 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1216 return result;
1217 }
1218
1219 // Get the group length using a utext_extract preflight.
1220 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1221 int32_t length = utext_extract(fInputText, groupStart, groupEnd, nullptr, 0, &status);
1222 if (status != U_BUFFER_OVERFLOW_ERROR) {
1223 return result;
1224 }
1225
1226 status = U_ZERO_ERROR;
1227 char16_t *buf = result.getBuffer(length);
1228 if (buf == nullptr) {
1229 status = U_MEMORY_ALLOCATION_ERROR;
1230 } else {
1231 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1232 result.releaseBuffer(extractLength);
1233 U_ASSERT(length == extractLength);
1234 }
1235 return result;
1236 }
1237
1238
1239 //--------------------------------------------------------------------------------
1240 //
1241 // appendGroup() -- currently internal only, appends a group to a UText rather
1242 // than replacing its contents
1243 //
1244 //--------------------------------------------------------------------------------
1245
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1246 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1247 if (U_FAILURE(status)) {
1248 return 0;
1249 }
1250 if (U_FAILURE(fDeferredStatus)) {
1251 status = fDeferredStatus;
1252 return 0;
1253 }
1254 int64_t destLen = utext_nativeLength(dest);
1255
1256 if (fMatch == false) {
1257 status = U_REGEX_INVALID_STATE;
1258 return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
1259 }
1260 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1261 status = U_INDEX_OUTOFBOUNDS_ERROR;
1262 return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
1263 }
1264
1265 int64_t s, e;
1266 if (groupNum == 0) {
1267 s = fMatchStart;
1268 e = fMatchEnd;
1269 } else {
1270 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1271 U_ASSERT(groupOffset < fPattern->fFrameSize);
1272 U_ASSERT(groupOffset >= 0);
1273 s = fFrame->fExtra[groupOffset];
1274 e = fFrame->fExtra[groupOffset+1];
1275 }
1276
1277 if (s < 0) {
1278 // A capture group wasn't part of the match
1279 return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
1280 }
1281 U_ASSERT(s <= e);
1282
1283 int64_t deltaLen;
1284 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1285 U_ASSERT(e <= fInputLength);
1286 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1287 } else {
1288 int32_t len16;
1289 if (UTEXT_USES_U16(fInputText)) {
1290 len16 = (int32_t)(e-s);
1291 } else {
1292 UErrorCode lengthStatus = U_ZERO_ERROR;
1293 len16 = utext_extract(fInputText, s, e, nullptr, 0, &lengthStatus);
1294 }
1295 char16_t *groupChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(len16+1));
1296 if (groupChars == nullptr) {
1297 status = U_MEMORY_ALLOCATION_ERROR;
1298 return 0;
1299 }
1300 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1301
1302 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1303 uprv_free(groupChars);
1304 }
1305 return deltaLen;
1306 }
1307
1308
1309
1310 //--------------------------------------------------------------------------------
1311 //
1312 // groupCount()
1313 //
1314 //--------------------------------------------------------------------------------
groupCount() const1315 int32_t RegexMatcher::groupCount() const {
1316 return fPattern->fGroupMap->size();
1317 }
1318
1319 //--------------------------------------------------------------------------------
1320 //
1321 // hasAnchoringBounds()
1322 //
1323 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1324 UBool RegexMatcher::hasAnchoringBounds() const {
1325 return fAnchoringBounds;
1326 }
1327
1328
1329 //--------------------------------------------------------------------------------
1330 //
1331 // hasTransparentBounds()
1332 //
1333 //--------------------------------------------------------------------------------
hasTransparentBounds() const1334 UBool RegexMatcher::hasTransparentBounds() const {
1335 return fTransparentBounds;
1336 }
1337
1338
1339
1340 //--------------------------------------------------------------------------------
1341 //
1342 // hitEnd()
1343 //
1344 //--------------------------------------------------------------------------------
hitEnd() const1345 UBool RegexMatcher::hitEnd() const {
1346 return fHitEnd;
1347 }
1348
1349
1350 //--------------------------------------------------------------------------------
1351 //
1352 // input()
1353 //
1354 //--------------------------------------------------------------------------------
input() const1355 const UnicodeString &RegexMatcher::input() const {
1356 if (!fInput) {
1357 UErrorCode status = U_ZERO_ERROR;
1358 int32_t len16;
1359 if (UTEXT_USES_U16(fInputText)) {
1360 len16 = (int32_t)fInputLength;
1361 } else {
1362 len16 = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &status);
1363 status = U_ZERO_ERROR; // overflow, length status
1364 }
1365 UnicodeString *result = new UnicodeString(len16, 0, 0);
1366
1367 char16_t *inputChars = result->getBuffer(len16);
1368 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1369 result->releaseBuffer(len16);
1370
1371 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1372 }
1373
1374 return *fInput;
1375 }
1376
1377 //--------------------------------------------------------------------------------
1378 //
1379 // inputText()
1380 //
1381 //--------------------------------------------------------------------------------
inputText() const1382 UText *RegexMatcher::inputText() const {
1383 return fInputText;
1384 }
1385
1386
1387 //--------------------------------------------------------------------------------
1388 //
1389 // getInput() -- like inputText(), but makes a clone or copies into another UText
1390 //
1391 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1392 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1393 if (U_FAILURE(status)) {
1394 return dest;
1395 }
1396 if (U_FAILURE(fDeferredStatus)) {
1397 status = fDeferredStatus;
1398 return dest;
1399 }
1400
1401 if (dest) {
1402 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1403 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1404 } else {
1405 int32_t input16Len;
1406 if (UTEXT_USES_U16(fInputText)) {
1407 input16Len = (int32_t)fInputLength;
1408 } else {
1409 UErrorCode lengthStatus = U_ZERO_ERROR;
1410 input16Len = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &lengthStatus); // buffer overflow error
1411 }
1412 char16_t *inputChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(input16Len));
1413 if (inputChars == nullptr) {
1414 return dest;
1415 }
1416
1417 status = U_ZERO_ERROR;
1418 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1419 status = U_ZERO_ERROR;
1420 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1421
1422 uprv_free(inputChars);
1423 }
1424 return dest;
1425 } else {
1426 return utext_clone(nullptr, fInputText, false, true, &status);
1427 }
1428 }
1429
1430
1431 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1432 static UBool compat_SyncMutableUTextContents(UText *ut) {
1433 UBool retVal = false;
1434
1435 // In the following test, we're really only interested in whether the UText should switch
1436 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1437 // will still point to the correct data.
1438 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1439 UnicodeString *us=(UnicodeString *)ut->context;
1440
1441 // Update to the latest length.
1442 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1443 int32_t newLength = us->length();
1444
1445 // Update the chunk description.
1446 // The buffer may have switched between stack- and heap-based.
1447 ut->chunkContents = us->getBuffer();
1448 ut->chunkLength = newLength;
1449 ut->chunkNativeLimit = newLength;
1450 ut->nativeIndexingLimit = newLength;
1451 retVal = true;
1452 }
1453
1454 return retVal;
1455 }
1456
1457 //--------------------------------------------------------------------------------
1458 //
1459 // lookingAt()
1460 //
1461 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1462 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1463 if (U_FAILURE(status)) {
1464 return false;
1465 }
1466 if (U_FAILURE(fDeferredStatus)) {
1467 status = fDeferredStatus;
1468 return false;
1469 }
1470
1471 if (fInputUniStrMaybeMutable) {
1472 if (compat_SyncMutableUTextContents(fInputText)) {
1473 fInputLength = utext_nativeLength(fInputText);
1474 reset();
1475 }
1476 }
1477 else {
1478 resetPreserveRegion();
1479 }
1480 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1481 MatchChunkAt((int32_t)fActiveStart, false, status);
1482 } else {
1483 MatchAt(fActiveStart, false, status);
1484 }
1485 return fMatch;
1486 }
1487
1488
lookingAt(int64_t start,UErrorCode & status)1489 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1490 if (U_FAILURE(status)) {
1491 return false;
1492 }
1493 if (U_FAILURE(fDeferredStatus)) {
1494 status = fDeferredStatus;
1495 return false;
1496 }
1497 reset();
1498
1499 if (start < 0) {
1500 status = U_INDEX_OUTOFBOUNDS_ERROR;
1501 return false;
1502 }
1503
1504 if (fInputUniStrMaybeMutable) {
1505 if (compat_SyncMutableUTextContents(fInputText)) {
1506 fInputLength = utext_nativeLength(fInputText);
1507 reset();
1508 }
1509 }
1510
1511 int64_t nativeStart;
1512 nativeStart = start;
1513 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1514 status = U_INDEX_OUTOFBOUNDS_ERROR;
1515 return false;
1516 }
1517
1518 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1519 MatchChunkAt((int32_t)nativeStart, false, status);
1520 } else {
1521 MatchAt(nativeStart, false, status);
1522 }
1523 return fMatch;
1524 }
1525
1526
1527
1528 //--------------------------------------------------------------------------------
1529 //
1530 // matches()
1531 //
1532 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1533 UBool RegexMatcher::matches(UErrorCode &status) {
1534 if (U_FAILURE(status)) {
1535 return false;
1536 }
1537 if (U_FAILURE(fDeferredStatus)) {
1538 status = fDeferredStatus;
1539 return false;
1540 }
1541
1542 if (fInputUniStrMaybeMutable) {
1543 if (compat_SyncMutableUTextContents(fInputText)) {
1544 fInputLength = utext_nativeLength(fInputText);
1545 reset();
1546 }
1547 }
1548 else {
1549 resetPreserveRegion();
1550 }
1551
1552 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1553 MatchChunkAt((int32_t)fActiveStart, true, status);
1554 } else {
1555 MatchAt(fActiveStart, true, status);
1556 }
1557 return fMatch;
1558 }
1559
1560
matches(int64_t start,UErrorCode & status)1561 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1562 if (U_FAILURE(status)) {
1563 return false;
1564 }
1565 if (U_FAILURE(fDeferredStatus)) {
1566 status = fDeferredStatus;
1567 return false;
1568 }
1569 reset();
1570
1571 if (start < 0) {
1572 status = U_INDEX_OUTOFBOUNDS_ERROR;
1573 return false;
1574 }
1575
1576 if (fInputUniStrMaybeMutable) {
1577 if (compat_SyncMutableUTextContents(fInputText)) {
1578 fInputLength = utext_nativeLength(fInputText);
1579 reset();
1580 }
1581 }
1582
1583 int64_t nativeStart;
1584 nativeStart = start;
1585 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1586 status = U_INDEX_OUTOFBOUNDS_ERROR;
1587 return false;
1588 }
1589
1590 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1591 MatchChunkAt((int32_t)nativeStart, true, status);
1592 } else {
1593 MatchAt(nativeStart, true, status);
1594 }
1595 return fMatch;
1596 }
1597
1598
1599
1600 //--------------------------------------------------------------------------------
1601 //
1602 // pattern
1603 //
1604 //--------------------------------------------------------------------------------
pattern() const1605 const RegexPattern &RegexMatcher::pattern() const {
1606 return *fPattern;
1607 }
1608
1609
1610
1611 //--------------------------------------------------------------------------------
1612 //
1613 // region
1614 //
1615 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1616 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1617 if (U_FAILURE(status)) {
1618 return *this;
1619 }
1620
1621 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1622 status = U_ILLEGAL_ARGUMENT_ERROR;
1623 }
1624
1625 int64_t nativeStart = regionStart;
1626 int64_t nativeLimit = regionLimit;
1627 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1628 status = U_ILLEGAL_ARGUMENT_ERROR;
1629 }
1630
1631 if (startIndex == -1)
1632 this->reset();
1633 else
1634 resetPreserveRegion();
1635
1636 fRegionStart = nativeStart;
1637 fRegionLimit = nativeLimit;
1638 fActiveStart = nativeStart;
1639 fActiveLimit = nativeLimit;
1640
1641 if (startIndex != -1) {
1642 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1643 status = U_INDEX_OUTOFBOUNDS_ERROR;
1644 }
1645 fMatchEnd = startIndex;
1646 }
1647
1648 if (!fTransparentBounds) {
1649 fLookStart = nativeStart;
1650 fLookLimit = nativeLimit;
1651 }
1652 if (fAnchoringBounds) {
1653 fAnchorStart = nativeStart;
1654 fAnchorLimit = nativeLimit;
1655 }
1656 return *this;
1657 }
1658
region(int64_t start,int64_t limit,UErrorCode & status)1659 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1660 return region(start, limit, -1, status);
1661 }
1662
1663 //--------------------------------------------------------------------------------
1664 //
1665 // regionEnd
1666 //
1667 //--------------------------------------------------------------------------------
regionEnd() const1668 int32_t RegexMatcher::regionEnd() const {
1669 return (int32_t)fRegionLimit;
1670 }
1671
regionEnd64() const1672 int64_t RegexMatcher::regionEnd64() const {
1673 return fRegionLimit;
1674 }
1675
1676 //--------------------------------------------------------------------------------
1677 //
1678 // regionStart
1679 //
1680 //--------------------------------------------------------------------------------
regionStart() const1681 int32_t RegexMatcher::regionStart() const {
1682 return (int32_t)fRegionStart;
1683 }
1684
regionStart64() const1685 int64_t RegexMatcher::regionStart64() const {
1686 return fRegionStart;
1687 }
1688
1689
1690 //--------------------------------------------------------------------------------
1691 //
1692 // replaceAll
1693 //
1694 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1695 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1696 UText replacementText = UTEXT_INITIALIZER;
1697 UText resultText = UTEXT_INITIALIZER;
1698 UnicodeString resultString;
1699 if (U_FAILURE(status)) {
1700 return resultString;
1701 }
1702
1703 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1704 utext_openUnicodeString(&resultText, &resultString, &status);
1705
1706 replaceAll(&replacementText, &resultText, status);
1707
1708 utext_close(&resultText);
1709 utext_close(&replacementText);
1710
1711 return resultString;
1712 }
1713
1714
1715 //
1716 // replaceAll, UText mode
1717 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1718 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1719 if (U_FAILURE(status)) {
1720 return dest;
1721 }
1722 if (U_FAILURE(fDeferredStatus)) {
1723 status = fDeferredStatus;
1724 return dest;
1725 }
1726
1727 if (dest == nullptr) {
1728 UnicodeString emptyString;
1729 UText empty = UTEXT_INITIALIZER;
1730
1731 utext_openUnicodeString(&empty, &emptyString, &status);
1732 dest = utext_clone(nullptr, &empty, true, false, &status);
1733 utext_close(&empty);
1734 }
1735
1736 if (U_SUCCESS(status)) {
1737 reset();
1738 while (find()) {
1739 appendReplacement(dest, replacement, status);
1740 if (U_FAILURE(status)) {
1741 break;
1742 }
1743 }
1744 appendTail(dest, status);
1745 }
1746
1747 return dest;
1748 }
1749
1750
1751 //--------------------------------------------------------------------------------
1752 //
1753 // replaceFirst
1754 //
1755 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1756 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1757 UText replacementText = UTEXT_INITIALIZER;
1758 UText resultText = UTEXT_INITIALIZER;
1759 UnicodeString resultString;
1760
1761 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1762 utext_openUnicodeString(&resultText, &resultString, &status);
1763
1764 replaceFirst(&replacementText, &resultText, status);
1765
1766 utext_close(&resultText);
1767 utext_close(&replacementText);
1768
1769 return resultString;
1770 }
1771
1772 //
1773 // replaceFirst, UText mode
1774 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1775 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1776 if (U_FAILURE(status)) {
1777 return dest;
1778 }
1779 if (U_FAILURE(fDeferredStatus)) {
1780 status = fDeferredStatus;
1781 return dest;
1782 }
1783
1784 reset();
1785 if (!find()) {
1786 return getInput(dest, status);
1787 }
1788
1789 if (dest == nullptr) {
1790 UnicodeString emptyString;
1791 UText empty = UTEXT_INITIALIZER;
1792
1793 utext_openUnicodeString(&empty, &emptyString, &status);
1794 dest = utext_clone(nullptr, &empty, true, false, &status);
1795 utext_close(&empty);
1796 }
1797
1798 appendReplacement(dest, replacement, status);
1799 appendTail(dest, status);
1800
1801 return dest;
1802 }
1803
1804
1805 //--------------------------------------------------------------------------------
1806 //
1807 // requireEnd
1808 //
1809 //--------------------------------------------------------------------------------
requireEnd() const1810 UBool RegexMatcher::requireEnd() const {
1811 return fRequireEnd;
1812 }
1813
1814
1815 //--------------------------------------------------------------------------------
1816 //
1817 // reset
1818 //
1819 //--------------------------------------------------------------------------------
reset()1820 RegexMatcher &RegexMatcher::reset() {
1821 fRegionStart = 0;
1822 fRegionLimit = fInputLength;
1823 fActiveStart = 0;
1824 fActiveLimit = fInputLength;
1825 fAnchorStart = 0;
1826 fAnchorLimit = fInputLength;
1827 fLookStart = 0;
1828 fLookLimit = fInputLength;
1829 resetPreserveRegion();
1830 return *this;
1831 }
1832
1833
1834
resetPreserveRegion()1835 void RegexMatcher::resetPreserveRegion() {
1836 fMatchStart = 0;
1837 fMatchEnd = 0;
1838 fLastMatchEnd = -1;
1839 fAppendPosition = 0;
1840 fMatch = false;
1841 fHitEnd = false;
1842 fRequireEnd = false;
1843 fTime = 0;
1844 fTickCounter = TIMER_INITIAL_VALUE;
1845 //resetStack(); // more expensive than it looks...
1846 }
1847
1848
reset(const UnicodeString & input)1849 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1850 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1851 if (fPattern->fNeedsAltInput) {
1852 fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
1853 }
1854 if (U_FAILURE(fDeferredStatus)) {
1855 return *this;
1856 }
1857 fInputLength = utext_nativeLength(fInputText);
1858
1859 reset();
1860 delete fInput;
1861 fInput = nullptr;
1862
1863 // Do the following for any UnicodeString.
1864 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1865 fInputUniStrMaybeMutable = true;
1866
1867 #if UCONFIG_NO_BREAK_ITERATION==0
1868 if (fWordBreakItr) {
1869 fWordBreakItr->setText(fInputText, fDeferredStatus);
1870 }
1871 if (fGCBreakItr) {
1872 fGCBreakItr->setText(fInputText, fDeferredStatus);
1873 }
1874 #endif
1875
1876 return *this;
1877 }
1878
1879
reset(UText * input)1880 RegexMatcher &RegexMatcher::reset(UText *input) {
1881 if (fInputText != input) {
1882 fInputText = utext_clone(fInputText, input, false, true, &fDeferredStatus);
1883 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
1884 if (U_FAILURE(fDeferredStatus)) {
1885 return *this;
1886 }
1887 fInputLength = utext_nativeLength(fInputText);
1888
1889 delete fInput;
1890 fInput = nullptr;
1891
1892 #if UCONFIG_NO_BREAK_ITERATION==0
1893 if (fWordBreakItr) {
1894 fWordBreakItr->setText(input, fDeferredStatus);
1895 }
1896 if (fGCBreakItr) {
1897 fGCBreakItr->setText(fInputText, fDeferredStatus);
1898 }
1899 #endif
1900 }
1901 reset();
1902 fInputUniStrMaybeMutable = false;
1903
1904 return *this;
1905 }
1906
1907 /*RegexMatcher &RegexMatcher::reset(const char16_t *) {
1908 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1909 return *this;
1910 }*/
1911
reset(int64_t position,UErrorCode & status)1912 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1913 if (U_FAILURE(status)) {
1914 return *this;
1915 }
1916 reset(); // Reset also resets the region to be the entire string.
1917
1918 if (position < 0 || position > fActiveLimit) {
1919 status = U_INDEX_OUTOFBOUNDS_ERROR;
1920 return *this;
1921 }
1922 fMatchEnd = position;
1923 return *this;
1924 }
1925
1926
1927 //--------------------------------------------------------------------------------
1928 //
1929 // refresh
1930 //
1931 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1932 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1933 if (U_FAILURE(status)) {
1934 return *this;
1935 }
1936 if (input == nullptr) {
1937 status = U_ILLEGAL_ARGUMENT_ERROR;
1938 return *this;
1939 }
1940 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1941 status = U_ILLEGAL_ARGUMENT_ERROR;
1942 return *this;
1943 }
1944 int64_t pos = utext_getNativeIndex(fInputText);
1945 // Shallow read-only clone of the new UText into the existing input UText
1946 fInputText = utext_clone(fInputText, input, false, true, &status);
1947 if (U_FAILURE(status)) {
1948 return *this;
1949 }
1950 utext_setNativeIndex(fInputText, pos);
1951
1952 if (fAltInputText != nullptr) {
1953 pos = utext_getNativeIndex(fAltInputText);
1954 fAltInputText = utext_clone(fAltInputText, input, false, true, &status);
1955 if (U_FAILURE(status)) {
1956 return *this;
1957 }
1958 utext_setNativeIndex(fAltInputText, pos);
1959 }
1960 return *this;
1961 }
1962
1963
1964
1965 //--------------------------------------------------------------------------------
1966 //
1967 // setTrace
1968 //
1969 //--------------------------------------------------------------------------------
setTrace(UBool state)1970 void RegexMatcher::setTrace(UBool state) {
1971 fTraceDebug = state;
1972 }
1973
1974
1975
1976 /**
1977 * UText, replace entire contents of the destination UText with a substring of the source UText.
1978 *
1979 * @param src The source UText
1980 * @param dest The destination UText. Must be writable.
1981 * May be nullptr, in which case a new UText will be allocated.
1982 * @param start Start index of source substring.
1983 * @param limit Limit index of source substring.
1984 * @param status An error code.
1985 */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1986 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1987 if (U_FAILURE(*status)) {
1988 return dest;
1989 }
1990 if (start == limit) {
1991 if (dest) {
1992 utext_replace(dest, 0, utext_nativeLength(dest), nullptr, 0, status);
1993 return dest;
1994 } else {
1995 return utext_openUChars(nullptr, nullptr, 0, status);
1996 }
1997 }
1998 int32_t length = utext_extract(src, start, limit, nullptr, 0, status);
1999 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
2000 return dest;
2001 }
2002 *status = U_ZERO_ERROR;
2003 MaybeStackArray<char16_t, 40> buffer;
2004 if (length >= buffer.getCapacity()) {
2005 char16_t *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
2006 if (newBuf == nullptr) {
2007 *status = U_MEMORY_ALLOCATION_ERROR;
2008 }
2009 }
2010 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2011 if (dest) {
2012 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2013 return dest;
2014 }
2015
2016 // Caller did not provide a preexisting UText.
2017 // Open a new one, and have it adopt the text buffer storage.
2018 if (U_FAILURE(*status)) {
2019 return nullptr;
2020 }
2021 int32_t ownedLength = 0;
2022 char16_t *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2023 if (ownedBuf == nullptr) {
2024 *status = U_MEMORY_ALLOCATION_ERROR;
2025 return nullptr;
2026 }
2027 UText *result = utext_openUChars(nullptr, ownedBuf, length, status);
2028 if (U_FAILURE(*status)) {
2029 uprv_free(ownedBuf);
2030 return nullptr;
2031 }
2032 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2033 return result;
2034 }
2035
2036
2037 //---------------------------------------------------------------------
2038 //
2039 // split
2040 //
2041 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2042 int32_t RegexMatcher::split(const UnicodeString &input,
2043 UnicodeString dest[],
2044 int32_t destCapacity,
2045 UErrorCode &status)
2046 {
2047 UText inputText = UTEXT_INITIALIZER;
2048 utext_openConstUnicodeString(&inputText, &input, &status);
2049 if (U_FAILURE(status)) {
2050 return 0;
2051 }
2052
2053 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2054 if (destText == nullptr) {
2055 status = U_MEMORY_ALLOCATION_ERROR;
2056 return 0;
2057 }
2058 int32_t i;
2059 for (i = 0; i < destCapacity; i++) {
2060 destText[i] = utext_openUnicodeString(nullptr, &dest[i], &status);
2061 }
2062
2063 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2064
2065 for (i = 0; i < destCapacity; i++) {
2066 utext_close(destText[i]);
2067 }
2068
2069 uprv_free(destText);
2070 utext_close(&inputText);
2071 return fieldCount;
2072 }
2073
2074 //
2075 // split, UText mode
2076 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2077 int32_t RegexMatcher::split(UText *input,
2078 UText *dest[],
2079 int32_t destCapacity,
2080 UErrorCode &status)
2081 {
2082 //
2083 // Check arguments for validity
2084 //
2085 if (U_FAILURE(status)) {
2086 return 0;
2087 }
2088
2089 if (destCapacity < 1) {
2090 status = U_ILLEGAL_ARGUMENT_ERROR;
2091 return 0;
2092 }
2093
2094 //
2095 // Reset for the input text
2096 //
2097 reset(input);
2098 int64_t nextOutputStringStart = 0;
2099 if (fActiveLimit == 0) {
2100 return 0;
2101 }
2102
2103 //
2104 // Loop through the input text, searching for the delimiter pattern
2105 //
2106 int32_t i;
2107 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2108 for (i=0; ; i++) {
2109 if (i>=destCapacity-1) {
2110 // There is one or zero output string left.
2111 // Fill the last output string with whatever is left from the input, then exit the loop.
2112 // ( i will be == destCapacity if we filled the output array while processing
2113 // capture groups of the delimiter expression, in which case we will discard the
2114 // last capture group saved in favor of the unprocessed remainder of the
2115 // input string.)
2116 i = destCapacity-1;
2117 if (fActiveLimit > nextOutputStringStart) {
2118 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2119 if (dest[i]) {
2120 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2121 input->chunkContents+nextOutputStringStart,
2122 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2123 } else {
2124 UText remainingText = UTEXT_INITIALIZER;
2125 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2126 fActiveLimit-nextOutputStringStart, &status);
2127 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2128 utext_close(&remainingText);
2129 }
2130 } else {
2131 UErrorCode lengthStatus = U_ZERO_ERROR;
2132 int32_t remaining16Length =
2133 utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus);
2134 char16_t *remainingChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(remaining16Length+1));
2135 if (remainingChars == nullptr) {
2136 status = U_MEMORY_ALLOCATION_ERROR;
2137 break;
2138 }
2139
2140 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2141 if (dest[i]) {
2142 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2143 } else {
2144 UText remainingText = UTEXT_INITIALIZER;
2145 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2146 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2147 utext_close(&remainingText);
2148 }
2149
2150 uprv_free(remainingChars);
2151 }
2152 }
2153 break;
2154 }
2155 if (find()) {
2156 // We found another delimiter. Move everything from where we started looking
2157 // up until the start of the delimiter into the next output string.
2158 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2159 if (dest[i]) {
2160 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2161 input->chunkContents+nextOutputStringStart,
2162 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2163 } else {
2164 UText remainingText = UTEXT_INITIALIZER;
2165 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2166 fMatchStart-nextOutputStringStart, &status);
2167 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2168 utext_close(&remainingText);
2169 }
2170 } else {
2171 UErrorCode lengthStatus = U_ZERO_ERROR;
2172 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, nullptr, 0, &lengthStatus);
2173 char16_t *remainingChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(remaining16Length+1));
2174 if (remainingChars == nullptr) {
2175 status = U_MEMORY_ALLOCATION_ERROR;
2176 break;
2177 }
2178 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2179 if (dest[i]) {
2180 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2181 } else {
2182 UText remainingText = UTEXT_INITIALIZER;
2183 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2184 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2185 utext_close(&remainingText);
2186 }
2187
2188 uprv_free(remainingChars);
2189 }
2190 nextOutputStringStart = fMatchEnd;
2191
2192 // If the delimiter pattern has capturing parentheses, the captured
2193 // text goes out into the next n destination strings.
2194 int32_t groupNum;
2195 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2196 if (i >= destCapacity-2) {
2197 // Never fill the last available output string with capture group text.
2198 // It will filled with the last field, the remainder of the
2199 // unsplit input text.
2200 break;
2201 }
2202 i++;
2203 dest[i] = utext_extract_replace(fInputText, dest[i],
2204 start64(groupNum, status), end64(groupNum, status), &status);
2205 }
2206
2207 if (nextOutputStringStart == fActiveLimit) {
2208 // The delimiter was at the end of the string. We're done, but first
2209 // we output one last empty string, for the empty field following
2210 // the delimiter at the end of input.
2211 if (i+1 < destCapacity) {
2212 ++i;
2213 if (dest[i] == nullptr) {
2214 dest[i] = utext_openUChars(nullptr, nullptr, 0, &status);
2215 } else {
2216 static const char16_t emptyString[] = {(char16_t)0};
2217 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2218 }
2219 }
2220 break;
2221
2222 }
2223 }
2224 else
2225 {
2226 // We ran off the end of the input while looking for the next delimiter.
2227 // All the remaining text goes into the current output string.
2228 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2229 if (dest[i]) {
2230 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2231 input->chunkContents+nextOutputStringStart,
2232 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2233 } else {
2234 UText remainingText = UTEXT_INITIALIZER;
2235 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2236 fActiveLimit-nextOutputStringStart, &status);
2237 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2238 utext_close(&remainingText);
2239 }
2240 } else {
2241 UErrorCode lengthStatus = U_ZERO_ERROR;
2242 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus);
2243 char16_t *remainingChars = (char16_t *)uprv_malloc(sizeof(char16_t)*(remaining16Length+1));
2244 if (remainingChars == nullptr) {
2245 status = U_MEMORY_ALLOCATION_ERROR;
2246 break;
2247 }
2248
2249 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2250 if (dest[i]) {
2251 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2252 } else {
2253 UText remainingText = UTEXT_INITIALIZER;
2254 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2255 dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2256 utext_close(&remainingText);
2257 }
2258
2259 uprv_free(remainingChars);
2260 }
2261 break;
2262 }
2263 if (U_FAILURE(status)) {
2264 break;
2265 }
2266 } // end of for loop
2267 return i+1;
2268 }
2269
2270
2271 //--------------------------------------------------------------------------------
2272 //
2273 // start
2274 //
2275 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2276 int32_t RegexMatcher::start(UErrorCode &status) const {
2277 return start(0, status);
2278 }
2279
start64(UErrorCode & status) const2280 int64_t RegexMatcher::start64(UErrorCode &status) const {
2281 return start64(0, status);
2282 }
2283
2284 //--------------------------------------------------------------------------------
2285 //
2286 // start(int32_t group, UErrorCode &status)
2287 //
2288 //--------------------------------------------------------------------------------
2289
start64(int32_t group,UErrorCode & status) const2290 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2291 if (U_FAILURE(status)) {
2292 return -1;
2293 }
2294 if (U_FAILURE(fDeferredStatus)) {
2295 status = fDeferredStatus;
2296 return -1;
2297 }
2298 if (fMatch == false) {
2299 status = U_REGEX_INVALID_STATE;
2300 return -1;
2301 }
2302 if (group < 0 || group > fPattern->fGroupMap->size()) {
2303 status = U_INDEX_OUTOFBOUNDS_ERROR;
2304 return -1;
2305 }
2306 int64_t s;
2307 if (group == 0) {
2308 s = fMatchStart;
2309 } else {
2310 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2311 U_ASSERT(groupOffset < fPattern->fFrameSize);
2312 U_ASSERT(groupOffset >= 0);
2313 s = fFrame->fExtra[groupOffset];
2314 }
2315
2316 return s;
2317 }
2318
2319
start(int32_t group,UErrorCode & status) const2320 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2321 return (int32_t)start64(group, status);
2322 }
2323
2324 //--------------------------------------------------------------------------------
2325 //
2326 // useAnchoringBounds
2327 //
2328 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2329 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2330 fAnchoringBounds = b;
2331 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2332 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2333 return *this;
2334 }
2335
2336
2337 //--------------------------------------------------------------------------------
2338 //
2339 // useTransparentBounds
2340 //
2341 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2342 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2343 fTransparentBounds = b;
2344 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2345 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2346 return *this;
2347 }
2348
2349 //--------------------------------------------------------------------------------
2350 //
2351 // setTimeLimit
2352 //
2353 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2354 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2355 if (U_FAILURE(status)) {
2356 return;
2357 }
2358 if (U_FAILURE(fDeferredStatus)) {
2359 status = fDeferredStatus;
2360 return;
2361 }
2362 if (limit < 0) {
2363 status = U_ILLEGAL_ARGUMENT_ERROR;
2364 return;
2365 }
2366 fTimeLimit = limit;
2367 }
2368
2369
2370 //--------------------------------------------------------------------------------
2371 //
2372 // getTimeLimit
2373 //
2374 //--------------------------------------------------------------------------------
getTimeLimit() const2375 int32_t RegexMatcher::getTimeLimit() const {
2376 return fTimeLimit;
2377 }
2378
2379
2380 //--------------------------------------------------------------------------------
2381 //
2382 // setStackLimit
2383 //
2384 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2385 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2386 if (U_FAILURE(status)) {
2387 return;
2388 }
2389 if (U_FAILURE(fDeferredStatus)) {
2390 status = fDeferredStatus;
2391 return;
2392 }
2393 if (limit < 0) {
2394 status = U_ILLEGAL_ARGUMENT_ERROR;
2395 return;
2396 }
2397
2398 // Reset the matcher. This is needed here in case there is a current match
2399 // whose final stack frame (containing the match results, pointed to by fFrame)
2400 // would be lost by resizing to a smaller stack size.
2401 reset();
2402
2403 if (limit == 0) {
2404 // Unlimited stack expansion
2405 fStack->setMaxCapacity(0);
2406 } else {
2407 // Change the units of the limit from bytes to ints, and bump the size up
2408 // to be big enough to hold at least one stack frame for the pattern,
2409 // if it isn't there already.
2410 int32_t adjustedLimit = limit / sizeof(int32_t);
2411 if (adjustedLimit < fPattern->fFrameSize) {
2412 adjustedLimit = fPattern->fFrameSize;
2413 }
2414 fStack->setMaxCapacity(adjustedLimit);
2415 }
2416 fStackLimit = limit;
2417 }
2418
2419
2420 //--------------------------------------------------------------------------------
2421 //
2422 // getStackLimit
2423 //
2424 //--------------------------------------------------------------------------------
getStackLimit() const2425 int32_t RegexMatcher::getStackLimit() const {
2426 return fStackLimit;
2427 }
2428
2429
2430 //--------------------------------------------------------------------------------
2431 //
2432 // setMatchCallback
2433 //
2434 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2435 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2436 const void *context,
2437 UErrorCode &status) {
2438 if (U_FAILURE(status)) {
2439 return;
2440 }
2441 fCallbackFn = callback;
2442 fCallbackContext = context;
2443 }
2444
2445
2446 //--------------------------------------------------------------------------------
2447 //
2448 // getMatchCallback
2449 //
2450 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2451 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2452 const void *&context,
2453 UErrorCode &status) {
2454 if (U_FAILURE(status)) {
2455 return;
2456 }
2457 callback = fCallbackFn;
2458 context = fCallbackContext;
2459 }
2460
2461
2462 //--------------------------------------------------------------------------------
2463 //
2464 // setMatchCallback
2465 //
2466 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2467 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2468 const void *context,
2469 UErrorCode &status) {
2470 if (U_FAILURE(status)) {
2471 return;
2472 }
2473 fFindProgressCallbackFn = callback;
2474 fFindProgressCallbackContext = context;
2475 }
2476
2477
2478 //--------------------------------------------------------------------------------
2479 //
2480 // getMatchCallback
2481 //
2482 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2483 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2484 const void *&context,
2485 UErrorCode &status) {
2486 if (U_FAILURE(status)) {
2487 return;
2488 }
2489 callback = fFindProgressCallbackFn;
2490 context = fFindProgressCallbackContext;
2491 }
2492
2493
2494 //================================================================================
2495 //
2496 // Code following this point in this file is the internal
2497 // Match Engine Implementation.
2498 //
2499 //================================================================================
2500
2501
2502 //--------------------------------------------------------------------------------
2503 //
2504 // resetStack
2505 // Discard any previous contents of the state save stack, and initialize a
2506 // new stack frame to all -1. The -1s are needed for capture group limits,
2507 // where they indicate that a group has not yet matched anything.
2508 //--------------------------------------------------------------------------------
resetStack()2509 REStackFrame *RegexMatcher::resetStack() {
2510 // Discard any previous contents of the state save stack, and initialize a
2511 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2512 // where they indicate that a group has not yet matched anything.
2513 fStack->removeAllElements();
2514
2515 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2516 if(U_FAILURE(fDeferredStatus)) {
2517 return nullptr;
2518 }
2519
2520 int32_t i;
2521 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2522 iFrame->fExtra[i] = -1;
2523 }
2524 return iFrame;
2525 }
2526
2527
2528
2529 //--------------------------------------------------------------------------------
2530 //
2531 // isWordBoundary
2532 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2533 // For us,
2534 // If the current char is a combining mark,
2535 // \b is false.
2536 // Else Scan backwards to the first non-combining char.
2537 // We are at a boundary if the this char and the original chars are
2538 // opposite in membership in \w set
2539 //
2540 // parameters: pos - the current position in the input buffer
2541 //
2542 // TODO: double-check edge cases at region boundaries.
2543 //
2544 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2545 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2546 UBool isBoundary = false;
2547 UBool cIsWord = false;
2548
2549 if (pos >= fLookLimit) {
2550 fHitEnd = true;
2551 } else {
2552 // Determine whether char c at current position is a member of the word set of chars.
2553 // If we're off the end of the string, behave as though we're not at a word char.
2554 UTEXT_SETNATIVEINDEX(fInputText, pos);
2555 UChar32 c = UTEXT_CURRENT32(fInputText);
2556 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2557 // Current char is a combining one. Not a boundary.
2558 return false;
2559 }
2560 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2561 }
2562
2563 // Back up until we come to a non-combining char, determine whether
2564 // that char is a word char.
2565 UBool prevCIsWord = false;
2566 for (;;) {
2567 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2568 break;
2569 }
2570 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2571 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2572 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2573 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2574 break;
2575 }
2576 }
2577 isBoundary = cIsWord ^ prevCIsWord;
2578 return isBoundary;
2579 }
2580
isChunkWordBoundary(int32_t pos)2581 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2582 UBool isBoundary = false;
2583 UBool cIsWord = false;
2584
2585 const char16_t *inputBuf = fInputText->chunkContents;
2586
2587 if (pos >= fLookLimit) {
2588 fHitEnd = true;
2589 } else {
2590 // Determine whether char c at current position is a member of the word set of chars.
2591 // If we're off the end of the string, behave as though we're not at a word char.
2592 UChar32 c;
2593 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2594 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2595 // Current char is a combining one. Not a boundary.
2596 return false;
2597 }
2598 cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2599 }
2600
2601 // Back up until we come to a non-combining char, determine whether
2602 // that char is a word char.
2603 UBool prevCIsWord = false;
2604 for (;;) {
2605 if (pos <= fLookStart) {
2606 break;
2607 }
2608 UChar32 prevChar;
2609 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2610 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2611 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2612 prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2613 break;
2614 }
2615 }
2616 isBoundary = cIsWord ^ prevCIsWord;
2617 return isBoundary;
2618 }
2619
2620 //--------------------------------------------------------------------------------
2621 //
2622 // isUWordBoundary
2623 //
2624 // Test for a word boundary using RBBI word break.
2625 //
2626 // parameters: pos - the current position in the input buffer
2627 //
2628 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos,UErrorCode & status)2629 UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
2630 UBool returnVal = false;
2631
2632 #if UCONFIG_NO_BREAK_ITERATION==0
2633 // Note: this point will never be reached if break iteration is configured out.
2634 // Regex patterns that would require this function will fail to compile.
2635
2636 // If we haven't yet created a break iterator for this matcher, do it now.
2637 if (fWordBreakItr == nullptr) {
2638 fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
2639 if (U_FAILURE(status)) {
2640 return false;
2641 }
2642 fWordBreakItr->setText(fInputText, status);
2643 }
2644
2645 // Note: zero width boundary tests like \b see through transparent region bounds,
2646 // which is why fLookLimit is used here, rather than fActiveLimit.
2647 if (pos >= fLookLimit) {
2648 fHitEnd = true;
2649 returnVal = true; // With Unicode word rules, only positions within the interior of "real"
2650 // words are not boundaries. All non-word chars stand by themselves,
2651 // with word boundaries on both sides.
2652 } else {
2653 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2654 }
2655 #endif
2656 return returnVal;
2657 }
2658
2659
followingGCBoundary(int64_t pos,UErrorCode & status)2660 int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
2661 int64_t result = pos;
2662
2663 #if UCONFIG_NO_BREAK_ITERATION==0
2664 // Note: this point will never be reached if break iteration is configured out.
2665 // Regex patterns that would require this function will fail to compile.
2666
2667 // If we haven't yet created a break iterator for this matcher, do it now.
2668 if (fGCBreakItr == nullptr) {
2669 fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2670 if (U_FAILURE(status)) {
2671 return pos;
2672 }
2673 fGCBreakItr->setText(fInputText, status);
2674 }
2675 result = fGCBreakItr->following(pos);
2676 if (result == BreakIterator::DONE) {
2677 result = pos;
2678 }
2679 #endif
2680 return result;
2681 }
2682
2683 //--------------------------------------------------------------------------------
2684 //
2685 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2686 // saves. Increment the "time" counter, and call the
2687 // user callback function if there is one installed.
2688 //
2689 // If the match operation needs to be aborted, either for a time-out
2690 // or because the user callback asked for it, just set an error status.
2691 // The engine will pick that up and stop in its outer loop.
2692 //
2693 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2694 void RegexMatcher::IncrementTime(UErrorCode &status) {
2695 fTickCounter = TIMER_INITIAL_VALUE;
2696 fTime++;
2697 if (fCallbackFn != nullptr) {
2698 if ((*fCallbackFn)(fCallbackContext, fTime) == false) {
2699 status = U_REGEX_STOPPED_BY_CALLER;
2700 return;
2701 }
2702 }
2703 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2704 status = U_REGEX_TIME_OUT;
2705 }
2706 }
2707
2708 //--------------------------------------------------------------------------------
2709 //
2710 // StateSave
2711 // Make a new stack frame, initialized as a copy of the current stack frame.
2712 // Set the pattern index in the original stack frame from the operand value
2713 // in the opcode. Execution of the engine continues with the state in
2714 // the newly created stack frame
2715 //
2716 // Note that reserveBlock() may grow the stack, resulting in the
2717 // whole thing being relocated in memory.
2718 //
2719 // Parameters:
2720 // fp The top frame pointer when called. At return, a new
2721 // fame will be present
2722 // savePatIdx An index into the compiled pattern. Goes into the original
2723 // (not new) frame. If execution ever back-tracks out of the
2724 // new frame, this will be where we continue from in the pattern.
2725 // Return
2726 // The new frame pointer.
2727 //
2728 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2729 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2730 if (U_FAILURE(status)) {
2731 return fp;
2732 }
2733 // push storage for a new frame.
2734 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2735 if (U_FAILURE(status)) {
2736 // Failure on attempted stack expansion.
2737 // Stack function set some other error code, change it to a more
2738 // specific one for regular expressions.
2739 status = U_REGEX_STACK_OVERFLOW;
2740 // We need to return a writable stack frame, so just return the
2741 // previous frame. The match operation will stop quickly
2742 // because of the error status, after which the frame will never
2743 // be looked at again.
2744 return fp;
2745 }
2746 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2747
2748 // New stack frame = copy of old top frame.
2749 int64_t *source = (int64_t *)fp;
2750 int64_t *dest = newFP;
2751 for (;;) {
2752 *dest++ = *source++;
2753 if (source == newFP) {
2754 break;
2755 }
2756 }
2757
2758 fTickCounter--;
2759 if (fTickCounter <= 0) {
2760 IncrementTime(status); // Re-initializes fTickCounter
2761 }
2762 fp->fPatIdx = savePatIdx;
2763 return (REStackFrame *)newFP;
2764 }
2765
2766 #if defined(REGEX_DEBUG)
2767 namespace {
StringFromUText(UText * ut)2768 UnicodeString StringFromUText(UText *ut) {
2769 UnicodeString result;
2770 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2771 result.append(c);
2772 }
2773 return result;
2774 }
2775 }
2776 #endif // REGEX_DEBUG
2777
2778
2779 //--------------------------------------------------------------------------------
2780 //
2781 // MatchAt This is the actual matching engine.
2782 //
2783 // startIdx: begin matching a this index.
2784 // toEnd: if true, match must extend to end of the input region
2785 //
2786 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2787 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2788 UBool isMatch = false; // True if the we have a match.
2789
2790 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2791
2792 int32_t op; // Operation from the compiled pattern, split into
2793 int32_t opType; // the opcode
2794 int32_t opValue; // and the operand value.
2795
2796 #ifdef REGEX_RUN_DEBUG
2797 if (fTraceDebug) {
2798 printf("MatchAt(startIdx=%ld)\n", startIdx);
2799 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2800 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2801 }
2802 #endif
2803
2804 if (U_FAILURE(status)) {
2805 return;
2806 }
2807
2808 // Cache frequently referenced items from the compiled pattern
2809 //
2810 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2811
2812 const char16_t *litText = fPattern->fLiteralText.getBuffer();
2813 UVector *fSets = fPattern->fSets;
2814
2815 fFrameSize = fPattern->fFrameSize;
2816 REStackFrame *fp = resetStack();
2817 if (U_FAILURE(fDeferredStatus)) {
2818 status = fDeferredStatus;
2819 return;
2820 }
2821
2822 fp->fPatIdx = 0;
2823 fp->fInputIdx = startIdx;
2824
2825 // Zero out the pattern's static data
2826 int32_t i;
2827 for (i = 0; i<fPattern->fDataSize; i++) {
2828 fData[i] = 0;
2829 }
2830
2831 //
2832 // Main loop for interpreting the compiled pattern.
2833 // One iteration of the loop per pattern operation performed.
2834 //
2835 for (;;) {
2836 op = (int32_t)pat[fp->fPatIdx];
2837 opType = URX_TYPE(op);
2838 opValue = URX_VAL(op);
2839 #ifdef REGEX_RUN_DEBUG
2840 if (fTraceDebug) {
2841 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2842 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2843 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2844 fPattern->dumpOp(fp->fPatIdx);
2845 }
2846 #endif
2847 fp->fPatIdx++;
2848
2849 switch (opType) {
2850
2851
2852 case URX_NOP:
2853 break;
2854
2855
2856 case URX_BACKTRACK:
2857 // Force a backtrack. In some circumstances, the pattern compiler
2858 // will notice that the pattern can't possibly match anything, and will
2859 // emit one of these at that point.
2860 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2861 break;
2862
2863
2864 case URX_ONECHAR:
2865 if (fp->fInputIdx < fActiveLimit) {
2866 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2867 UChar32 c = UTEXT_NEXT32(fInputText);
2868 if (c == opValue) {
2869 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2870 break;
2871 }
2872 } else {
2873 fHitEnd = true;
2874 }
2875 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2876 break;
2877
2878
2879 case URX_STRING:
2880 {
2881 // Test input against a literal string.
2882 // Strings require two slots in the compiled pattern, one for the
2883 // offset to the string text, and one for the length.
2884
2885 int32_t stringStartIdx = opValue;
2886 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2887 fp->fPatIdx++;
2888 opType = URX_TYPE(op);
2889 int32_t stringLen = URX_VAL(op);
2890 U_ASSERT(opType == URX_STRING_LEN);
2891 U_ASSERT(stringLen >= 2);
2892
2893 const char16_t *patternString = litText+stringStartIdx;
2894 int32_t patternStringIndex = 0;
2895 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2896 UChar32 inputChar;
2897 UChar32 patternChar;
2898 UBool success = true;
2899 while (patternStringIndex < stringLen) {
2900 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2901 success = false;
2902 fHitEnd = true;
2903 break;
2904 }
2905 inputChar = UTEXT_NEXT32(fInputText);
2906 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2907 if (patternChar != inputChar) {
2908 success = false;
2909 break;
2910 }
2911 }
2912
2913 if (success) {
2914 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2915 } else {
2916 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2917 }
2918 }
2919 break;
2920
2921
2922 case URX_STATE_SAVE:
2923 fp = StateSave(fp, opValue, status);
2924 break;
2925
2926
2927 case URX_END:
2928 // The match loop will exit via this path on a successful match,
2929 // when we reach the end of the pattern.
2930 if (toEnd && fp->fInputIdx != fActiveLimit) {
2931 // The pattern matched, but not to the end of input. Try some more.
2932 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2933 break;
2934 }
2935 isMatch = true;
2936 goto breakFromLoop;
2937
2938 // Start and End Capture stack frame variables are laid out out like this:
2939 // fp->fExtra[opValue] - The start of a completed capture group
2940 // opValue+1 - The end of a completed capture group
2941 // opValue+2 - the start of a capture group whose end
2942 // has not yet been reached (and might not ever be).
2943 case URX_START_CAPTURE:
2944 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2945 fp->fExtra[opValue+2] = fp->fInputIdx;
2946 break;
2947
2948
2949 case URX_END_CAPTURE:
2950 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2951 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2952 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2953 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2954 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2955 break;
2956
2957
2958 case URX_DOLLAR: // $, test for End of line
2959 // or for position before new line at end of input
2960 {
2961 if (fp->fInputIdx >= fAnchorLimit) {
2962 // We really are at the end of input. Success.
2963 fHitEnd = true;
2964 fRequireEnd = true;
2965 break;
2966 }
2967
2968 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2969
2970 // If we are positioned just before a new-line that is located at the
2971 // end of input, succeed.
2972 UChar32 c = UTEXT_NEXT32(fInputText);
2973 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2974 if (isLineTerminator(c)) {
2975 // If not in the middle of a CR/LF sequence
2976 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2977 // At new-line at end of input. Success
2978 fHitEnd = true;
2979 fRequireEnd = true;
2980
2981 break;
2982 }
2983 }
2984 } else {
2985 UChar32 nextC = UTEXT_NEXT32(fInputText);
2986 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2987 fHitEnd = true;
2988 fRequireEnd = true;
2989 break; // At CR/LF at end of input. Success
2990 }
2991 }
2992
2993 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2994 }
2995 break;
2996
2997
2998 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2999 if (fp->fInputIdx >= fAnchorLimit) {
3000 // Off the end of input. Success.
3001 fHitEnd = true;
3002 fRequireEnd = true;
3003 break;
3004 } else {
3005 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3006 UChar32 c = UTEXT_NEXT32(fInputText);
3007 // Either at the last character of input, or off the end.
3008 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3009 fHitEnd = true;
3010 fRequireEnd = true;
3011 break;
3012 }
3013 }
3014
3015 // Not at end of input. Back-track out.
3016 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3017 break;
3018
3019
3020 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
3021 {
3022 if (fp->fInputIdx >= fAnchorLimit) {
3023 // We really are at the end of input. Success.
3024 fHitEnd = true;
3025 fRequireEnd = true;
3026 break;
3027 }
3028 // If we are positioned just before a new-line, succeed.
3029 // It makes no difference where the new-line is within the input.
3030 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3031 UChar32 c = UTEXT_CURRENT32(fInputText);
3032 if (isLineTerminator(c)) {
3033 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3034 // In multi-line mode, hitting a new-line just before the end of input does not
3035 // set the hitEnd or requireEnd flags
3036 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3037 break;
3038 }
3039 }
3040 // not at a new line. Fail.
3041 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3042 }
3043 break;
3044
3045
3046 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3047 {
3048 if (fp->fInputIdx >= fAnchorLimit) {
3049 // We really are at the end of input. Success.
3050 fHitEnd = true;
3051 fRequireEnd = true; // Java set requireEnd in this case, even though
3052 break; // adding a new-line would not lose the match.
3053 }
3054 // If we are not positioned just before a new-line, the test fails; backtrack out.
3055 // It makes no difference where the new-line is within the input.
3056 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3057 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3058 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3059 }
3060 }
3061 break;
3062
3063
3064 case URX_CARET: // ^, test for start of line
3065 if (fp->fInputIdx != fAnchorStart) {
3066 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3067 }
3068 break;
3069
3070
3071 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3072 {
3073 if (fp->fInputIdx == fAnchorStart) {
3074 // We are at the start input. Success.
3075 break;
3076 }
3077 // Check whether character just before the current pos is a new-line
3078 // unless we are at the end of input
3079 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3080 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3081 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3082 // It's a new-line. ^ is true. Success.
3083 // TODO: what should be done with positions between a CR and LF?
3084 break;
3085 }
3086 // Not at the start of a line. Fail.
3087 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3088 }
3089 break;
3090
3091
3092 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3093 {
3094 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3095 if (fp->fInputIdx <= fAnchorStart) {
3096 // We are at the start input. Success.
3097 break;
3098 }
3099 // Check whether character just before the current pos is a new-line
3100 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3101 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3102 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3103 if (c != 0x0a) {
3104 // Not at the start of a line. Back-track out.
3105 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3106 }
3107 }
3108 break;
3109
3110 case URX_BACKSLASH_B: // Test for word boundaries
3111 {
3112 UBool success = isWordBoundary(fp->fInputIdx);
3113 success ^= (UBool)(opValue != 0); // flip sense for \B
3114 if (!success) {
3115 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3116 }
3117 }
3118 break;
3119
3120
3121 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3122 {
3123 UBool success = isUWordBoundary(fp->fInputIdx, status);
3124 success ^= (UBool)(opValue != 0); // flip sense for \B
3125 if (!success) {
3126 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3127 }
3128 }
3129 break;
3130
3131
3132 case URX_BACKSLASH_D: // Test for decimal digit
3133 {
3134 if (fp->fInputIdx >= fActiveLimit) {
3135 fHitEnd = true;
3136 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3137 break;
3138 }
3139
3140 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3141
3142 UChar32 c = UTEXT_NEXT32(fInputText);
3143 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3144 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3145 success ^= (UBool)(opValue != 0); // flip sense for \D
3146 if (success) {
3147 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3148 } else {
3149 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3150 }
3151 }
3152 break;
3153
3154
3155 case URX_BACKSLASH_G: // Test for position at end of previous match
3156 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
3157 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3158 }
3159 break;
3160
3161
3162 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3163 {
3164 if (fp->fInputIdx >= fActiveLimit) {
3165 fHitEnd = true;
3166 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3167 break;
3168 }
3169 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3170 UChar32 c = UTEXT_NEXT32(fInputText);
3171 int8_t ctype = u_charType(c);
3172 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3173 success ^= (UBool)(opValue != 0); // flip sense for \H
3174 if (success) {
3175 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3176 } else {
3177 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3178 }
3179 }
3180 break;
3181
3182
3183 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3184 {
3185 if (fp->fInputIdx >= fActiveLimit) {
3186 fHitEnd = true;
3187 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3188 break;
3189 }
3190 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3191 UChar32 c = UTEXT_NEXT32(fInputText);
3192 if (isLineTerminator(c)) {
3193 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3194 utext_next32(fInputText);
3195 }
3196 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3197 } else {
3198 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3199 }
3200 }
3201 break;
3202
3203
3204 case URX_BACKSLASH_V: // \v, any single line ending character.
3205 {
3206 if (fp->fInputIdx >= fActiveLimit) {
3207 fHitEnd = true;
3208 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3209 break;
3210 }
3211 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3212 UChar32 c = UTEXT_NEXT32(fInputText);
3213 UBool success = isLineTerminator(c);
3214 success ^= (UBool)(opValue != 0); // flip sense for \V
3215 if (success) {
3216 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3217 } else {
3218 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3219 }
3220 }
3221 break;
3222
3223
3224 case URX_BACKSLASH_X:
3225 // Match a Grapheme, as defined by Unicode UAX 29.
3226
3227 // Fail if at end of input
3228 if (fp->fInputIdx >= fActiveLimit) {
3229 fHitEnd = true;
3230 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3231 break;
3232 }
3233
3234 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
3235 if (fp->fInputIdx >= fActiveLimit) {
3236 fHitEnd = true;
3237 fp->fInputIdx = fActiveLimit;
3238 }
3239 break;
3240
3241
3242 case URX_BACKSLASH_Z: // Test for end of Input
3243 if (fp->fInputIdx < fAnchorLimit) {
3244 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3245 } else {
3246 fHitEnd = true;
3247 fRequireEnd = true;
3248 }
3249 break;
3250
3251
3252
3253 case URX_STATIC_SETREF:
3254 {
3255 // Test input character against one of the predefined sets
3256 // (Word Characters, for example)
3257 // The high bit of the op value is a flag for the match polarity.
3258 // 0: success if input char is in set.
3259 // 1: success if input char is not in set.
3260 if (fp->fInputIdx >= fActiveLimit) {
3261 fHitEnd = true;
3262 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3263 break;
3264 }
3265
3266 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3267 opValue &= ~URX_NEG_SET;
3268 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3269
3270 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3271 UChar32 c = UTEXT_NEXT32(fInputText);
3272 if (c < 256) {
3273 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3274 if (s8.contains(c)) {
3275 success = !success;
3276 }
3277 } else {
3278 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3279 if (s.contains(c)) {
3280 success = !success;
3281 }
3282 }
3283 if (success) {
3284 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3285 } else {
3286 // the character wasn't in the set.
3287 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3288 }
3289 }
3290 break;
3291
3292
3293 case URX_STAT_SETREF_N:
3294 {
3295 // Test input character for NOT being a member of one of
3296 // the predefined sets (Word Characters, for example)
3297 if (fp->fInputIdx >= fActiveLimit) {
3298 fHitEnd = true;
3299 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3300 break;
3301 }
3302
3303 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3304
3305 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3306
3307 UChar32 c = UTEXT_NEXT32(fInputText);
3308 if (c < 256) {
3309 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3310 if (s8.contains(c) == false) {
3311 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3312 break;
3313 }
3314 } else {
3315 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3316 if (s.contains(c) == false) {
3317 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3318 break;
3319 }
3320 }
3321 // the character wasn't in the set.
3322 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3323 }
3324 break;
3325
3326
3327 case URX_SETREF:
3328 if (fp->fInputIdx >= fActiveLimit) {
3329 fHitEnd = true;
3330 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3331 break;
3332 } else {
3333 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3334
3335 // There is input left. Pick up one char and test it for set membership.
3336 UChar32 c = UTEXT_NEXT32(fInputText);
3337 U_ASSERT(opValue > 0 && opValue < fSets->size());
3338 if (c<256) {
3339 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3340 if (s8->contains(c)) {
3341 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3342 break;
3343 }
3344 } else {
3345 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3346 if (s->contains(c)) {
3347 // The character is in the set. A Match.
3348 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3349 break;
3350 }
3351 }
3352
3353 // the character wasn't in the set.
3354 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3355 }
3356 break;
3357
3358
3359 case URX_DOTANY:
3360 {
3361 // . matches anything, but stops at end-of-line.
3362 if (fp->fInputIdx >= fActiveLimit) {
3363 // At end of input. Match failed. Backtrack out.
3364 fHitEnd = true;
3365 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3366 break;
3367 }
3368
3369 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3370
3371 // There is input left. Advance over one char, unless we've hit end-of-line
3372 UChar32 c = UTEXT_NEXT32(fInputText);
3373 if (isLineTerminator(c)) {
3374 // End of line in normal mode. . does not match.
3375 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3376 break;
3377 }
3378 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3379 }
3380 break;
3381
3382
3383 case URX_DOTANY_ALL:
3384 {
3385 // ., in dot-matches-all (including new lines) mode
3386 if (fp->fInputIdx >= fActiveLimit) {
3387 // At end of input. Match failed. Backtrack out.
3388 fHitEnd = true;
3389 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3390 break;
3391 }
3392
3393 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3394
3395 // There is input left. Advance over one char, except if we are
3396 // at a cr/lf, advance over both of them.
3397 UChar32 c;
3398 c = UTEXT_NEXT32(fInputText);
3399 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3400 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3401 // In the case of a CR/LF, we need to advance over both.
3402 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3403 if (nextc == 0x0a) {
3404 (void)UTEXT_NEXT32(fInputText);
3405 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3406 }
3407 }
3408 }
3409 break;
3410
3411
3412 case URX_DOTANY_UNIX:
3413 {
3414 // '.' operator, matches all, but stops at end-of-line.
3415 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3416 if (fp->fInputIdx >= fActiveLimit) {
3417 // At end of input. Match failed. Backtrack out.
3418 fHitEnd = true;
3419 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3420 break;
3421 }
3422
3423 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3424
3425 // There is input left. Advance over one char, unless we've hit end-of-line
3426 UChar32 c = UTEXT_NEXT32(fInputText);
3427 if (c == 0x0a) {
3428 // End of line in normal mode. '.' does not match the \n
3429 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3430 } else {
3431 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3432 }
3433 }
3434 break;
3435
3436
3437 case URX_JMP:
3438 fp->fPatIdx = opValue;
3439 break;
3440
3441 case URX_FAIL:
3442 isMatch = false;
3443 goto breakFromLoop;
3444
3445 case URX_JMP_SAV:
3446 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3447 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3448 fp->fPatIdx = opValue; // Then JMP.
3449 break;
3450
3451 case URX_JMP_SAV_X:
3452 // This opcode is used with (x)+, when x can match a zero length string.
3453 // Same as JMP_SAV, except conditional on the match having made forward progress.
3454 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3455 // data address of the input position at the start of the loop.
3456 {
3457 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3458 int32_t stoOp = (int32_t)pat[opValue-1];
3459 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3460 int32_t frameLoc = URX_VAL(stoOp);
3461 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3462 int64_t prevInputIdx = fp->fExtra[frameLoc];
3463 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3464 if (prevInputIdx < fp->fInputIdx) {
3465 // The match did make progress. Repeat the loop.
3466 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3467 fp->fPatIdx = opValue;
3468 fp->fExtra[frameLoc] = fp->fInputIdx;
3469 }
3470 // If the input position did not advance, we do nothing here,
3471 // execution will fall out of the loop.
3472 }
3473 break;
3474
3475 case URX_CTR_INIT:
3476 {
3477 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3478 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3479
3480 // Pick up the three extra operands that CTR_INIT has, and
3481 // skip the pattern location counter past
3482 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3483 fp->fPatIdx += 3;
3484 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3485 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3486 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3487 U_ASSERT(minCount>=0);
3488 U_ASSERT(maxCount>=minCount || maxCount==-1);
3489 U_ASSERT(loopLoc>=fp->fPatIdx);
3490
3491 if (minCount == 0) {
3492 fp = StateSave(fp, loopLoc+1, status);
3493 }
3494 if (maxCount == -1) {
3495 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3496 } else if (maxCount == 0) {
3497 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3498 }
3499 }
3500 break;
3501
3502 case URX_CTR_LOOP:
3503 {
3504 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3505 int32_t initOp = (int32_t)pat[opValue];
3506 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3507 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3508 int32_t minCount = (int32_t)pat[opValue+2];
3509 int32_t maxCount = (int32_t)pat[opValue+3];
3510 (*pCounter)++;
3511 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3512 U_ASSERT(*pCounter == maxCount);
3513 break;
3514 }
3515 if (*pCounter >= minCount) {
3516 if (maxCount == -1) {
3517 // Loop has no hard upper bound.
3518 // Check that it is progressing through the input, break if it is not.
3519 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3520 if (fp->fInputIdx == *pLastInputIdx) {
3521 break;
3522 } else {
3523 *pLastInputIdx = fp->fInputIdx;
3524 }
3525 }
3526 fp = StateSave(fp, fp->fPatIdx, status);
3527 } else {
3528 // Increment time-out counter. (StateSave() does it if count >= minCount)
3529 fTickCounter--;
3530 if (fTickCounter <= 0) {
3531 IncrementTime(status); // Re-initializes fTickCounter
3532 }
3533 }
3534
3535 fp->fPatIdx = opValue + 4; // Loop back.
3536 }
3537 break;
3538
3539 case URX_CTR_INIT_NG:
3540 {
3541 // Initialize a non-greedy loop
3542 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3543 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3544
3545 // Pick up the three extra operands that CTR_INIT_NG has, and
3546 // skip the pattern location counter past
3547 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3548 fp->fPatIdx += 3;
3549 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3550 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3551 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3552 U_ASSERT(minCount>=0);
3553 U_ASSERT(maxCount>=minCount || maxCount==-1);
3554 U_ASSERT(loopLoc>fp->fPatIdx);
3555 if (maxCount == -1) {
3556 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3557 }
3558
3559 if (minCount == 0) {
3560 if (maxCount != 0) {
3561 fp = StateSave(fp, fp->fPatIdx, status);
3562 }
3563 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3564 }
3565 }
3566 break;
3567
3568 case URX_CTR_LOOP_NG:
3569 {
3570 // Non-greedy {min, max} loops
3571 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3572 int32_t initOp = (int32_t)pat[opValue];
3573 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3574 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3575 int32_t minCount = (int32_t)pat[opValue+2];
3576 int32_t maxCount = (int32_t)pat[opValue+3];
3577
3578 (*pCounter)++;
3579 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3580 // The loop has matched the maximum permitted number of times.
3581 // Break out of here with no action. Matching will
3582 // continue with the following pattern.
3583 U_ASSERT(*pCounter == maxCount);
3584 break;
3585 }
3586
3587 if (*pCounter < minCount) {
3588 // We haven't met the minimum number of matches yet.
3589 // Loop back for another one.
3590 fp->fPatIdx = opValue + 4; // Loop back.
3591 // Increment time-out counter. (StateSave() does it if count >= minCount)
3592 fTickCounter--;
3593 if (fTickCounter <= 0) {
3594 IncrementTime(status); // Re-initializes fTickCounter
3595 }
3596 } else {
3597 // We do have the minimum number of matches.
3598
3599 // If there is no upper bound on the loop iterations, check that the input index
3600 // is progressing, and stop the loop if it is not.
3601 if (maxCount == -1) {
3602 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3603 if (fp->fInputIdx == *pLastInputIdx) {
3604 break;
3605 }
3606 *pLastInputIdx = fp->fInputIdx;
3607 }
3608
3609 // Loop Continuation: we will fall into the pattern following the loop
3610 // (non-greedy, don't execute loop body first), but first do
3611 // a state save to the top of the loop, so that a match failure
3612 // in the following pattern will try another iteration of the loop.
3613 fp = StateSave(fp, opValue + 4, status);
3614 }
3615 }
3616 break;
3617
3618 case URX_STO_SP:
3619 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3620 fData[opValue] = fStack->size();
3621 break;
3622
3623 case URX_LD_SP:
3624 {
3625 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3626 int32_t newStackSize = (int32_t)fData[opValue];
3627 U_ASSERT(newStackSize <= fStack->size());
3628 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3629 if (newFP == (int64_t *)fp) {
3630 break;
3631 }
3632 int32_t j;
3633 for (j=0; j<fFrameSize; j++) {
3634 newFP[j] = ((int64_t *)fp)[j];
3635 }
3636 fp = (REStackFrame *)newFP;
3637 fStack->setSize(newStackSize);
3638 }
3639 break;
3640
3641 case URX_BACKREF:
3642 {
3643 U_ASSERT(opValue < fFrameSize);
3644 int64_t groupStartIdx = fp->fExtra[opValue];
3645 int64_t groupEndIdx = fp->fExtra[opValue+1];
3646 U_ASSERT(groupStartIdx <= groupEndIdx);
3647 if (groupStartIdx < 0) {
3648 // This capture group has not participated in the match thus far,
3649 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3650 break;
3651 }
3652 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3653 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3654
3655 // Note: if the capture group match was of an empty string the backref
3656 // match succeeds. Verified by testing: Perl matches succeed
3657 // in this case, so we do too.
3658
3659 UBool success = true;
3660 for (;;) {
3661 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3662 success = true;
3663 break;
3664 }
3665 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3666 success = false;
3667 fHitEnd = true;
3668 break;
3669 }
3670 UChar32 captureGroupChar = utext_next32(fAltInputText);
3671 UChar32 inputChar = utext_next32(fInputText);
3672 if (inputChar != captureGroupChar) {
3673 success = false;
3674 break;
3675 }
3676 }
3677
3678 if (success) {
3679 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3680 } else {
3681 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3682 }
3683 }
3684 break;
3685
3686
3687
3688 case URX_BACKREF_I:
3689 {
3690 U_ASSERT(opValue < fFrameSize);
3691 int64_t groupStartIdx = fp->fExtra[opValue];
3692 int64_t groupEndIdx = fp->fExtra[opValue+1];
3693 U_ASSERT(groupStartIdx <= groupEndIdx);
3694 if (groupStartIdx < 0) {
3695 // This capture group has not participated in the match thus far,
3696 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3697 break;
3698 }
3699 utext_setNativeIndex(fAltInputText, groupStartIdx);
3700 utext_setNativeIndex(fInputText, fp->fInputIdx);
3701 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3702 CaseFoldingUTextIterator inputItr(*fInputText);
3703
3704 // Note: if the capture group match was of an empty string the backref
3705 // match succeeds. Verified by testing: Perl matches succeed
3706 // in this case, so we do too.
3707
3708 UBool success = true;
3709 for (;;) {
3710 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3711 success = true;
3712 break;
3713 }
3714 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3715 success = false;
3716 fHitEnd = true;
3717 break;
3718 }
3719 UChar32 captureGroupChar = captureGroupItr.next();
3720 UChar32 inputChar = inputItr.next();
3721 if (inputChar != captureGroupChar) {
3722 success = false;
3723 break;
3724 }
3725 }
3726
3727 if (success && inputItr.inExpansion()) {
3728 // We obtained a match by consuming part of a string obtained from
3729 // case-folding a single code point of the input text.
3730 // This does not count as an overall match.
3731 success = false;
3732 }
3733
3734 if (success) {
3735 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3736 } else {
3737 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3738 }
3739
3740 }
3741 break;
3742
3743 case URX_STO_INP_LOC:
3744 {
3745 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3746 fp->fExtra[opValue] = fp->fInputIdx;
3747 }
3748 break;
3749
3750 case URX_JMPX:
3751 {
3752 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3753 fp->fPatIdx += 1;
3754 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3755 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3756 int64_t savedInputIdx = fp->fExtra[dataLoc];
3757 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3758 if (savedInputIdx < fp->fInputIdx) {
3759 fp->fPatIdx = opValue; // JMP
3760 } else {
3761 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3762 }
3763 }
3764 break;
3765
3766 case URX_LA_START:
3767 {
3768 // Entering a look around block.
3769 // Save Stack Ptr, Input Pos.
3770 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3771 fData[opValue] = fStack->size();
3772 fData[opValue+1] = fp->fInputIdx;
3773 fData[opValue+2] = fActiveStart;
3774 fData[opValue+3] = fActiveLimit;
3775 fActiveStart = fLookStart; // Set the match region change for
3776 fActiveLimit = fLookLimit; // transparent bounds.
3777 }
3778 break;
3779
3780 case URX_LA_END:
3781 {
3782 // Leaving a look-ahead block.
3783 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3784 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3785 int32_t stackSize = fStack->size();
3786 int32_t newStackSize =(int32_t)fData[opValue];
3787 U_ASSERT(stackSize >= newStackSize);
3788 if (stackSize > newStackSize) {
3789 // Copy the current top frame back to the new (cut back) top frame.
3790 // This makes the capture groups from within the look-ahead
3791 // expression available.
3792 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3793 int32_t j;
3794 for (j=0; j<fFrameSize; j++) {
3795 newFP[j] = ((int64_t *)fp)[j];
3796 }
3797 fp = (REStackFrame *)newFP;
3798 fStack->setSize(newStackSize);
3799 }
3800 fp->fInputIdx = fData[opValue+1];
3801
3802 // Restore the active region bounds in the input string; they may have
3803 // been changed because of transparent bounds on a Region.
3804 fActiveStart = fData[opValue+2];
3805 fActiveLimit = fData[opValue+3];
3806 U_ASSERT(fActiveStart >= 0);
3807 U_ASSERT(fActiveLimit <= fInputLength);
3808 }
3809 break;
3810
3811 case URX_ONECHAR_I:
3812 // Case insensitive one char. The char from the pattern is already case folded.
3813 // Input text is not, but case folding the input can not reduce two or more code
3814 // points to one.
3815 if (fp->fInputIdx < fActiveLimit) {
3816 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3817
3818 UChar32 c = UTEXT_NEXT32(fInputText);
3819 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3820 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3821 break;
3822 }
3823 } else {
3824 fHitEnd = true;
3825 }
3826
3827 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3828 break;
3829
3830 case URX_STRING_I:
3831 {
3832 // Case-insensitive test input against a literal string.
3833 // Strings require two slots in the compiled pattern, one for the
3834 // offset to the string text, and one for the length.
3835 // The compiled string has already been case folded.
3836 {
3837 const char16_t *patternString = litText + opValue;
3838 int32_t patternStringIdx = 0;
3839
3840 op = (int32_t)pat[fp->fPatIdx];
3841 fp->fPatIdx++;
3842 opType = URX_TYPE(op);
3843 opValue = URX_VAL(op);
3844 U_ASSERT(opType == URX_STRING_LEN);
3845 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3846
3847
3848 UChar32 cPattern;
3849 UChar32 cText;
3850 UBool success = true;
3851
3852 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3853 CaseFoldingUTextIterator inputIterator(*fInputText);
3854 while (patternStringIdx < patternStringLen) {
3855 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3856 success = false;
3857 fHitEnd = true;
3858 break;
3859 }
3860 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3861 cText = inputIterator.next();
3862 if (cText != cPattern) {
3863 success = false;
3864 break;
3865 }
3866 }
3867 if (inputIterator.inExpansion()) {
3868 success = false;
3869 }
3870
3871 if (success) {
3872 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3873 } else {
3874 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3875 }
3876 }
3877 }
3878 break;
3879
3880 case URX_LB_START:
3881 {
3882 // Entering a look-behind block.
3883 // Save Stack Ptr, Input Pos and active input region.
3884 // TODO: implement transparent bounds. Ticket #6067
3885 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3886 fData[opValue] = fStack->size();
3887 fData[opValue+1] = fp->fInputIdx;
3888 // Save input string length, then reset to pin any matches to end at
3889 // the current position.
3890 fData[opValue+2] = fActiveStart;
3891 fData[opValue+3] = fActiveLimit;
3892 fActiveStart = fRegionStart;
3893 fActiveLimit = fp->fInputIdx;
3894 // Init the variable containing the start index for attempted matches.
3895 fData[opValue+4] = -1;
3896 }
3897 break;
3898
3899
3900 case URX_LB_CONT:
3901 {
3902 // Positive Look-Behind, at top of loop checking for matches of LB expression
3903 // at all possible input starting positions.
3904
3905 // Fetch the min and max possible match lengths. They are the operands
3906 // of this op in the pattern.
3907 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3908 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3909 if (!UTEXT_USES_U16(fInputText)) {
3910 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3911 // The max length need not be exact; it just needs to be >= actual maximum.
3912 maxML *= 3;
3913 }
3914 U_ASSERT(minML <= maxML);
3915 U_ASSERT(minML >= 0);
3916
3917 // Fetch (from data) the last input index where a match was attempted.
3918 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3919 int64_t &lbStartIdx = fData[opValue+4];
3920 if (lbStartIdx < 0) {
3921 // First time through loop.
3922 lbStartIdx = fp->fInputIdx - minML;
3923 if (lbStartIdx > 0) {
3924 // move index to a code point boundary, if it's not on one already.
3925 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3926 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3927 }
3928 } else {
3929 // 2nd through nth time through the loop.
3930 // Back up start position for match by one.
3931 if (lbStartIdx == 0) {
3932 (lbStartIdx)--;
3933 } else {
3934 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3935 (void)UTEXT_PREVIOUS32(fInputText);
3936 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3937 }
3938 }
3939
3940 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3941 // We have tried all potential match starting points without
3942 // getting a match. Backtrack out, and out of the
3943 // Look Behind altogether.
3944 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3945 fActiveStart = fData[opValue+2];
3946 fActiveLimit = fData[opValue+3];
3947 U_ASSERT(fActiveStart >= 0);
3948 U_ASSERT(fActiveLimit <= fInputLength);
3949 break;
3950 }
3951
3952 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3953 // (successful match will fall off the end of the loop.)
3954 fp = StateSave(fp, fp->fPatIdx-3, status);
3955 fp->fInputIdx = lbStartIdx;
3956 }
3957 break;
3958
3959 case URX_LB_END:
3960 // End of a look-behind block, after a successful match.
3961 {
3962 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3963 if (fp->fInputIdx != fActiveLimit) {
3964 // The look-behind expression matched, but the match did not
3965 // extend all the way to the point that we are looking behind from.
3966 // FAIL out of here, which will take us back to the LB_CONT, which
3967 // will retry the match starting at another position or fail
3968 // the look-behind altogether, whichever is appropriate.
3969 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3970 break;
3971 }
3972
3973 // Look-behind match is good. Restore the original input string region,
3974 // which had been truncated to pin the end of the lookbehind match to the
3975 // position being looked-behind.
3976 fActiveStart = fData[opValue+2];
3977 fActiveLimit = fData[opValue+3];
3978 U_ASSERT(fActiveStart >= 0);
3979 U_ASSERT(fActiveLimit <= fInputLength);
3980 }
3981 break;
3982
3983
3984 case URX_LBN_CONT:
3985 {
3986 // Negative Look-Behind, at top of loop checking for matches of LB expression
3987 // at all possible input starting positions.
3988
3989 // Fetch the extra parameters of this op.
3990 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3991 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3992 if (!UTEXT_USES_U16(fInputText)) {
3993 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3994 // The max length need not be exact; it just needs to be >= actual maximum.
3995 maxML *= 3;
3996 }
3997 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3998 continueLoc = URX_VAL(continueLoc);
3999 U_ASSERT(minML <= maxML);
4000 U_ASSERT(minML >= 0);
4001 U_ASSERT(continueLoc > fp->fPatIdx);
4002
4003 // Fetch (from data) the last input index where a match was attempted.
4004 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4005 int64_t &lbStartIdx = fData[opValue+4];
4006 if (lbStartIdx < 0) {
4007 // First time through loop.
4008 lbStartIdx = fp->fInputIdx - minML;
4009 if (lbStartIdx > 0) {
4010 // move index to a code point boundary, if it's not on one already.
4011 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4012 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4013 }
4014 } else {
4015 // 2nd through nth time through the loop.
4016 // Back up start position for match by one.
4017 if (lbStartIdx == 0) {
4018 (lbStartIdx)--;
4019 } else {
4020 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4021 (void)UTEXT_PREVIOUS32(fInputText);
4022 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4023 }
4024 }
4025
4026 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4027 // We have tried all potential match starting points without
4028 // getting a match, which means that the negative lookbehind as
4029 // a whole has succeeded. Jump forward to the continue location
4030 fActiveStart = fData[opValue+2];
4031 fActiveLimit = fData[opValue+3];
4032 U_ASSERT(fActiveStart >= 0);
4033 U_ASSERT(fActiveLimit <= fInputLength);
4034 fp->fPatIdx = continueLoc;
4035 break;
4036 }
4037
4038 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4039 // (successful match will cause a FAIL out of the loop altogether.)
4040 fp = StateSave(fp, fp->fPatIdx-4, status);
4041 fp->fInputIdx = lbStartIdx;
4042 }
4043 break;
4044
4045 case URX_LBN_END:
4046 // End of a negative look-behind block, after a successful match.
4047 {
4048 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4049 if (fp->fInputIdx != fActiveLimit) {
4050 // The look-behind expression matched, but the match did not
4051 // extend all the way to the point that we are looking behind from.
4052 // FAIL out of here, which will take us back to the LB_CONT, which
4053 // will retry the match starting at another position or succeed
4054 // the look-behind altogether, whichever is appropriate.
4055 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4056 break;
4057 }
4058
4059 // Look-behind expression matched, which means look-behind test as
4060 // a whole Fails
4061
4062 // Restore the original input string length, which had been truncated
4063 // inorder to pin the end of the lookbehind match
4064 // to the position being looked-behind.
4065 fActiveStart = fData[opValue+2];
4066 fActiveLimit = fData[opValue+3];
4067 U_ASSERT(fActiveStart >= 0);
4068 U_ASSERT(fActiveLimit <= fInputLength);
4069
4070 // Restore original stack position, discarding any state saved
4071 // by the successful pattern match.
4072 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4073 int32_t newStackSize = (int32_t)fData[opValue];
4074 U_ASSERT(fStack->size() > newStackSize);
4075 fStack->setSize(newStackSize);
4076
4077 // FAIL, which will take control back to someplace
4078 // prior to entering the look-behind test.
4079 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4080 }
4081 break;
4082
4083
4084 case URX_LOOP_SR_I:
4085 // Loop Initialization for the optimized implementation of
4086 // [some character set]*
4087 // This op scans through all matching input.
4088 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4089 {
4090 U_ASSERT(opValue > 0 && opValue < fSets->size());
4091 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4092 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4093
4094 // Loop through input, until either the input is exhausted or
4095 // we reach a character that is not a member of the set.
4096 int64_t ix = fp->fInputIdx;
4097 UTEXT_SETNATIVEINDEX(fInputText, ix);
4098 for (;;) {
4099 if (ix >= fActiveLimit) {
4100 fHitEnd = true;
4101 break;
4102 }
4103 UChar32 c = UTEXT_NEXT32(fInputText);
4104 if (c<256) {
4105 if (s8->contains(c) == false) {
4106 break;
4107 }
4108 } else {
4109 if (s->contains(c) == false) {
4110 break;
4111 }
4112 }
4113 ix = UTEXT_GETNATIVEINDEX(fInputText);
4114 }
4115
4116 // If there were no matching characters, skip over the loop altogether.
4117 // The loop doesn't run at all, a * op always succeeds.
4118 if (ix == fp->fInputIdx) {
4119 fp->fPatIdx++; // skip the URX_LOOP_C op.
4120 break;
4121 }
4122
4123 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4124 // must follow. It's operand is the stack location
4125 // that holds the starting input index for the match of this [set]*
4126 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4127 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4128 int32_t stackLoc = URX_VAL(loopcOp);
4129 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4130 fp->fExtra[stackLoc] = fp->fInputIdx;
4131 fp->fInputIdx = ix;
4132
4133 // Save State to the URX_LOOP_C op that follows this one,
4134 // so that match failures in the following code will return to there.
4135 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4136 fp = StateSave(fp, fp->fPatIdx, status);
4137 fp->fPatIdx++;
4138 }
4139 break;
4140
4141
4142 case URX_LOOP_DOT_I:
4143 // Loop Initialization for the optimized implementation of .*
4144 // This op scans through all remaining input.
4145 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4146 {
4147 // Loop through input until the input is exhausted (we reach an end-of-line)
4148 // In DOTALL mode, we can just go straight to the end of the input.
4149 int64_t ix;
4150 if ((opValue & 1) == 1) {
4151 // Dot-matches-All mode. Jump straight to the end of the string.
4152 ix = fActiveLimit;
4153 fHitEnd = true;
4154 } else {
4155 // NOT DOT ALL mode. Line endings do not match '.'
4156 // Scan forward until a line ending or end of input.
4157 ix = fp->fInputIdx;
4158 UTEXT_SETNATIVEINDEX(fInputText, ix);
4159 for (;;) {
4160 if (ix >= fActiveLimit) {
4161 fHitEnd = true;
4162 break;
4163 }
4164 UChar32 c = UTEXT_NEXT32(fInputText);
4165 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4166 if ((c == 0x0a) || // 0x0a is newline in both modes.
4167 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4168 isLineTerminator(c))) {
4169 // char is a line ending. Exit the scanning loop.
4170 break;
4171 }
4172 }
4173 ix = UTEXT_GETNATIVEINDEX(fInputText);
4174 }
4175 }
4176
4177 // If there were no matching characters, skip over the loop altogether.
4178 // The loop doesn't run at all, a * op always succeeds.
4179 if (ix == fp->fInputIdx) {
4180 fp->fPatIdx++; // skip the URX_LOOP_C op.
4181 break;
4182 }
4183
4184 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4185 // must follow. It's operand is the stack location
4186 // that holds the starting input index for the match of this .*
4187 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4188 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4189 int32_t stackLoc = URX_VAL(loopcOp);
4190 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4191 fp->fExtra[stackLoc] = fp->fInputIdx;
4192 fp->fInputIdx = ix;
4193
4194 // Save State to the URX_LOOP_C op that follows this one,
4195 // so that match failures in the following code will return to there.
4196 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4197 fp = StateSave(fp, fp->fPatIdx, status);
4198 fp->fPatIdx++;
4199 }
4200 break;
4201
4202
4203 case URX_LOOP_C:
4204 {
4205 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4206 backSearchIndex = fp->fExtra[opValue];
4207 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4208 if (backSearchIndex == fp->fInputIdx) {
4209 // We've backed up the input idx to the point that the loop started.
4210 // The loop is done. Leave here without saving state.
4211 // Subsequent failures won't come back here.
4212 break;
4213 }
4214 // Set up for the next iteration of the loop, with input index
4215 // backed up by one from the last time through,
4216 // and a state save to this instruction in case the following code fails again.
4217 // (We're going backwards because this loop emulates stack unwinding, not
4218 // the initial scan forward.)
4219 U_ASSERT(fp->fInputIdx > 0);
4220 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4221 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4222 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4223
4224 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4225 if (prevC == 0x0a &&
4226 fp->fInputIdx > backSearchIndex &&
4227 twoPrevC == 0x0d) {
4228 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4229 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4230 // .*, stepping back over CRLF pair.
4231 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4232 }
4233 }
4234
4235
4236 fp = StateSave(fp, fp->fPatIdx-1, status);
4237 }
4238 break;
4239
4240
4241
4242 default:
4243 // Trouble. The compiled pattern contains an entry with an
4244 // unrecognized type tag.
4245 UPRV_UNREACHABLE_ASSERT;
4246 // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
4247 // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
4248 // See ICU-21669.
4249 status = U_INTERNAL_PROGRAM_ERROR;
4250 }
4251
4252 if (U_FAILURE(status)) {
4253 isMatch = false;
4254 break;
4255 }
4256 }
4257
4258 breakFromLoop:
4259 fMatch = isMatch;
4260 if (isMatch) {
4261 fLastMatchEnd = fMatchEnd;
4262 fMatchStart = startIdx;
4263 fMatchEnd = fp->fInputIdx;
4264 }
4265
4266 #ifdef REGEX_RUN_DEBUG
4267 if (fTraceDebug) {
4268 if (isMatch) {
4269 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4270 } else {
4271 printf("No match\n\n");
4272 }
4273 }
4274 #endif
4275
4276 fFrame = fp; // The active stack frame when the engine stopped.
4277 // Contains the capture group results that we need to
4278 // access later.
4279 }
4280
4281
4282 //--------------------------------------------------------------------------------
4283 //
4284 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4285 // assumption that the entire string is available in the UText's
4286 // chunk buffer. For now, that means we can use int32_t indexes,
4287 // except for anything that needs to be saved (like group starts
4288 // and ends).
4289 //
4290 // startIdx: begin matching a this index.
4291 // toEnd: if true, match must extend to end of the input region
4292 //
4293 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4294 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4295 UBool isMatch = false; // True if the we have a match.
4296
4297 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4298
4299 int32_t op; // Operation from the compiled pattern, split into
4300 int32_t opType; // the opcode
4301 int32_t opValue; // and the operand value.
4302
4303 #ifdef REGEX_RUN_DEBUG
4304 if (fTraceDebug) {
4305 printf("MatchAt(startIdx=%d)\n", startIdx);
4306 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4307 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4308 }
4309 #endif
4310
4311 if (U_FAILURE(status)) {
4312 return;
4313 }
4314
4315 // Cache frequently referenced items from the compiled pattern
4316 //
4317 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4318
4319 const char16_t *litText = fPattern->fLiteralText.getBuffer();
4320 UVector *fSets = fPattern->fSets;
4321
4322 const char16_t *inputBuf = fInputText->chunkContents;
4323
4324 fFrameSize = fPattern->fFrameSize;
4325 REStackFrame *fp = resetStack();
4326 if (U_FAILURE(fDeferredStatus)) {
4327 status = fDeferredStatus;
4328 return;
4329 }
4330
4331 fp->fPatIdx = 0;
4332 fp->fInputIdx = startIdx;
4333
4334 // Zero out the pattern's static data
4335 int32_t i;
4336 for (i = 0; i<fPattern->fDataSize; i++) {
4337 fData[i] = 0;
4338 }
4339
4340 //
4341 // Main loop for interpreting the compiled pattern.
4342 // One iteration of the loop per pattern operation performed.
4343 //
4344 for (;;) {
4345 op = (int32_t)pat[fp->fPatIdx];
4346 opType = URX_TYPE(op);
4347 opValue = URX_VAL(op);
4348 #ifdef REGEX_RUN_DEBUG
4349 if (fTraceDebug) {
4350 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4351 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4352 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4353 fPattern->dumpOp(fp->fPatIdx);
4354 }
4355 #endif
4356 fp->fPatIdx++;
4357
4358 switch (opType) {
4359
4360
4361 case URX_NOP:
4362 break;
4363
4364
4365 case URX_BACKTRACK:
4366 // Force a backtrack. In some circumstances, the pattern compiler
4367 // will notice that the pattern can't possibly match anything, and will
4368 // emit one of these at that point.
4369 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4370 break;
4371
4372
4373 case URX_ONECHAR:
4374 if (fp->fInputIdx < fActiveLimit) {
4375 UChar32 c;
4376 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4377 if (c == opValue) {
4378 break;
4379 }
4380 } else {
4381 fHitEnd = true;
4382 }
4383 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4384 break;
4385
4386
4387 case URX_STRING:
4388 {
4389 // Test input against a literal string.
4390 // Strings require two slots in the compiled pattern, one for the
4391 // offset to the string text, and one for the length.
4392 int32_t stringStartIdx = opValue;
4393 int32_t stringLen;
4394
4395 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4396 fp->fPatIdx++;
4397 opType = URX_TYPE(op);
4398 stringLen = URX_VAL(op);
4399 U_ASSERT(opType == URX_STRING_LEN);
4400 U_ASSERT(stringLen >= 2);
4401
4402 const char16_t * pInp = inputBuf + fp->fInputIdx;
4403 const char16_t * pInpLimit = inputBuf + fActiveLimit;
4404 const char16_t * pPat = litText+stringStartIdx;
4405 const char16_t * pEnd = pInp + stringLen;
4406 UBool success = true;
4407 while (pInp < pEnd) {
4408 if (pInp >= pInpLimit) {
4409 fHitEnd = true;
4410 success = false;
4411 break;
4412 }
4413 if (*pInp++ != *pPat++) {
4414 success = false;
4415 break;
4416 }
4417 }
4418
4419 if (success) {
4420 fp->fInputIdx += stringLen;
4421 } else {
4422 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4423 }
4424 }
4425 break;
4426
4427
4428 case URX_STATE_SAVE:
4429 fp = StateSave(fp, opValue, status);
4430 break;
4431
4432
4433 case URX_END:
4434 // The match loop will exit via this path on a successful match,
4435 // when we reach the end of the pattern.
4436 if (toEnd && fp->fInputIdx != fActiveLimit) {
4437 // The pattern matched, but not to the end of input. Try some more.
4438 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4439 break;
4440 }
4441 isMatch = true;
4442 goto breakFromLoop;
4443
4444 // Start and End Capture stack frame variables are laid out out like this:
4445 // fp->fExtra[opValue] - The start of a completed capture group
4446 // opValue+1 - The end of a completed capture group
4447 // opValue+2 - the start of a capture group whose end
4448 // has not yet been reached (and might not ever be).
4449 case URX_START_CAPTURE:
4450 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4451 fp->fExtra[opValue+2] = fp->fInputIdx;
4452 break;
4453
4454
4455 case URX_END_CAPTURE:
4456 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4457 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4458 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4459 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4460 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4461 break;
4462
4463
4464 case URX_DOLLAR: // $, test for End of line
4465 // or for position before new line at end of input
4466 if (fp->fInputIdx < fAnchorLimit-2) {
4467 // We are no where near the end of input. Fail.
4468 // This is the common case. Keep it first.
4469 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4470 break;
4471 }
4472 if (fp->fInputIdx >= fAnchorLimit) {
4473 // We really are at the end of input. Success.
4474 fHitEnd = true;
4475 fRequireEnd = true;
4476 break;
4477 }
4478
4479 // If we are positioned just before a new-line that is located at the
4480 // end of input, succeed.
4481 if (fp->fInputIdx == fAnchorLimit-1) {
4482 UChar32 c;
4483 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4484
4485 if (isLineTerminator(c)) {
4486 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4487 // At new-line at end of input. Success
4488 fHitEnd = true;
4489 fRequireEnd = true;
4490 break;
4491 }
4492 }
4493 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4494 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4495 fHitEnd = true;
4496 fRequireEnd = true;
4497 break; // At CR/LF at end of input. Success
4498 }
4499
4500 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4501
4502 break;
4503
4504
4505 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4506 if (fp->fInputIdx >= fAnchorLimit-1) {
4507 // Either at the last character of input, or off the end.
4508 if (fp->fInputIdx == fAnchorLimit-1) {
4509 // At last char of input. Success if it's a new line.
4510 if (inputBuf[fp->fInputIdx] == 0x0a) {
4511 fHitEnd = true;
4512 fRequireEnd = true;
4513 break;
4514 }
4515 } else {
4516 // Off the end of input. Success.
4517 fHitEnd = true;
4518 fRequireEnd = true;
4519 break;
4520 }
4521 }
4522
4523 // Not at end of input. Back-track out.
4524 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4525 break;
4526
4527
4528 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4529 {
4530 if (fp->fInputIdx >= fAnchorLimit) {
4531 // We really are at the end of input. Success.
4532 fHitEnd = true;
4533 fRequireEnd = true;
4534 break;
4535 }
4536 // If we are positioned just before a new-line, succeed.
4537 // It makes no difference where the new-line is within the input.
4538 UChar32 c = inputBuf[fp->fInputIdx];
4539 if (isLineTerminator(c)) {
4540 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4541 // In multi-line mode, hitting a new-line just before the end of input does not
4542 // set the hitEnd or requireEnd flags
4543 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4544 break;
4545 }
4546 }
4547 // not at a new line. Fail.
4548 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4549 }
4550 break;
4551
4552
4553 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4554 {
4555 if (fp->fInputIdx >= fAnchorLimit) {
4556 // We really are at the end of input. Success.
4557 fHitEnd = true;
4558 fRequireEnd = true; // Java set requireEnd in this case, even though
4559 break; // adding a new-line would not lose the match.
4560 }
4561 // If we are not positioned just before a new-line, the test fails; backtrack out.
4562 // It makes no difference where the new-line is within the input.
4563 if (inputBuf[fp->fInputIdx] != 0x0a) {
4564 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4565 }
4566 }
4567 break;
4568
4569
4570 case URX_CARET: // ^, test for start of line
4571 if (fp->fInputIdx != fAnchorStart) {
4572 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4573 }
4574 break;
4575
4576
4577 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4578 {
4579 if (fp->fInputIdx == fAnchorStart) {
4580 // We are at the start input. Success.
4581 break;
4582 }
4583 // Check whether character just before the current pos is a new-line
4584 // unless we are at the end of input
4585 char16_t c = inputBuf[fp->fInputIdx - 1];
4586 if ((fp->fInputIdx < fAnchorLimit) &&
4587 isLineTerminator(c)) {
4588 // It's a new-line. ^ is true. Success.
4589 // TODO: what should be done with positions between a CR and LF?
4590 break;
4591 }
4592 // Not at the start of a line. Fail.
4593 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4594 }
4595 break;
4596
4597
4598 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4599 {
4600 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4601 if (fp->fInputIdx <= fAnchorStart) {
4602 // We are at the start input. Success.
4603 break;
4604 }
4605 // Check whether character just before the current pos is a new-line
4606 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4607 char16_t c = inputBuf[fp->fInputIdx - 1];
4608 if (c != 0x0a) {
4609 // Not at the start of a line. Back-track out.
4610 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4611 }
4612 }
4613 break;
4614
4615 case URX_BACKSLASH_B: // Test for word boundaries
4616 {
4617 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4618 success ^= (UBool)(opValue != 0); // flip sense for \B
4619 if (!success) {
4620 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4621 }
4622 }
4623 break;
4624
4625
4626 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4627 {
4628 UBool success = isUWordBoundary(fp->fInputIdx, status);
4629 success ^= (UBool)(opValue != 0); // flip sense for \B
4630 if (!success) {
4631 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4632 }
4633 }
4634 break;
4635
4636
4637 case URX_BACKSLASH_D: // Test for decimal digit
4638 {
4639 if (fp->fInputIdx >= fActiveLimit) {
4640 fHitEnd = true;
4641 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4642 break;
4643 }
4644
4645 UChar32 c;
4646 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4647 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4648 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4649 success ^= (UBool)(opValue != 0); // flip sense for \D
4650 if (!success) {
4651 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4652 }
4653 }
4654 break;
4655
4656
4657 case URX_BACKSLASH_G: // Test for position at end of previous match
4658 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
4659 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4660 }
4661 break;
4662
4663
4664 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4665 {
4666 if (fp->fInputIdx >= fActiveLimit) {
4667 fHitEnd = true;
4668 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4669 break;
4670 }
4671 UChar32 c;
4672 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4673 int8_t ctype = u_charType(c);
4674 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4675 success ^= (UBool)(opValue != 0); // flip sense for \H
4676 if (!success) {
4677 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4678 }
4679 }
4680 break;
4681
4682
4683 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4684 {
4685 if (fp->fInputIdx >= fActiveLimit) {
4686 fHitEnd = true;
4687 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4688 break;
4689 }
4690 UChar32 c;
4691 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4692 if (isLineTerminator(c)) {
4693 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4694 // Check for CR/LF sequence. Consume both together when found.
4695 char16_t c2;
4696 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4697 if (c2 != 0x0a) {
4698 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4699 }
4700 }
4701 } else {
4702 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4703 }
4704 }
4705 break;
4706
4707
4708 case URX_BACKSLASH_V: // Any single code point line ending.
4709 {
4710 if (fp->fInputIdx >= fActiveLimit) {
4711 fHitEnd = true;
4712 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4713 break;
4714 }
4715 UChar32 c;
4716 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4717 UBool success = isLineTerminator(c);
4718 success ^= (UBool)(opValue != 0); // flip sense for \V
4719 if (!success) {
4720 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4721 }
4722 }
4723 break;
4724
4725
4726 case URX_BACKSLASH_X:
4727 // Match a Grapheme, as defined by Unicode UAX 29.
4728
4729 // Fail if at end of input
4730 if (fp->fInputIdx >= fActiveLimit) {
4731 fHitEnd = true;
4732 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4733 break;
4734 }
4735
4736 fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
4737 if (fp->fInputIdx >= fActiveLimit) {
4738 fHitEnd = true;
4739 fp->fInputIdx = fActiveLimit;
4740 }
4741 break;
4742
4743
4744 case URX_BACKSLASH_Z: // Test for end of Input
4745 if (fp->fInputIdx < fAnchorLimit) {
4746 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4747 } else {
4748 fHitEnd = true;
4749 fRequireEnd = true;
4750 }
4751 break;
4752
4753
4754
4755 case URX_STATIC_SETREF:
4756 {
4757 // Test input character against one of the predefined sets
4758 // (Word Characters, for example)
4759 // The high bit of the op value is a flag for the match polarity.
4760 // 0: success if input char is in set.
4761 // 1: success if input char is not in set.
4762 if (fp->fInputIdx >= fActiveLimit) {
4763 fHitEnd = true;
4764 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4765 break;
4766 }
4767
4768 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4769 opValue &= ~URX_NEG_SET;
4770 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4771
4772 UChar32 c;
4773 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4774 if (c < 256) {
4775 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4776 if (s8.contains(c)) {
4777 success = !success;
4778 }
4779 } else {
4780 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4781 if (s.contains(c)) {
4782 success = !success;
4783 }
4784 }
4785 if (!success) {
4786 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4787 }
4788 }
4789 break;
4790
4791
4792 case URX_STAT_SETREF_N:
4793 {
4794 // Test input character for NOT being a member of one of
4795 // the predefined sets (Word Characters, for example)
4796 if (fp->fInputIdx >= fActiveLimit) {
4797 fHitEnd = true;
4798 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4799 break;
4800 }
4801
4802 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4803
4804 UChar32 c;
4805 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4806 if (c < 256) {
4807 Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4808 if (s8.contains(c) == false) {
4809 break;
4810 }
4811 } else {
4812 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4813 if (s.contains(c) == false) {
4814 break;
4815 }
4816 }
4817 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4818 }
4819 break;
4820
4821
4822 case URX_SETREF:
4823 {
4824 if (fp->fInputIdx >= fActiveLimit) {
4825 fHitEnd = true;
4826 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4827 break;
4828 }
4829
4830 U_ASSERT(opValue > 0 && opValue < fSets->size());
4831
4832 // There is input left. Pick up one char and test it for set membership.
4833 UChar32 c;
4834 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4835 if (c<256) {
4836 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4837 if (s8->contains(c)) {
4838 // The character is in the set. A Match.
4839 break;
4840 }
4841 } else {
4842 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4843 if (s->contains(c)) {
4844 // The character is in the set. A Match.
4845 break;
4846 }
4847 }
4848
4849 // the character wasn't in the set.
4850 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4851 }
4852 break;
4853
4854
4855 case URX_DOTANY:
4856 {
4857 // . matches anything, but stops at end-of-line.
4858 if (fp->fInputIdx >= fActiveLimit) {
4859 // At end of input. Match failed. Backtrack out.
4860 fHitEnd = true;
4861 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4862 break;
4863 }
4864
4865 // There is input left. Advance over one char, unless we've hit end-of-line
4866 UChar32 c;
4867 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4868 if (isLineTerminator(c)) {
4869 // End of line in normal mode. . does not match.
4870 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4871 break;
4872 }
4873 }
4874 break;
4875
4876
4877 case URX_DOTANY_ALL:
4878 {
4879 // . in dot-matches-all (including new lines) mode
4880 if (fp->fInputIdx >= fActiveLimit) {
4881 // At end of input. Match failed. Backtrack out.
4882 fHitEnd = true;
4883 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4884 break;
4885 }
4886
4887 // There is input left. Advance over one char, except if we are
4888 // at a cr/lf, advance over both of them.
4889 UChar32 c;
4890 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4891 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4892 // In the case of a CR/LF, we need to advance over both.
4893 if (inputBuf[fp->fInputIdx] == 0x0a) {
4894 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4895 }
4896 }
4897 }
4898 break;
4899
4900
4901 case URX_DOTANY_UNIX:
4902 {
4903 // '.' operator, matches all, but stops at end-of-line.
4904 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4905 if (fp->fInputIdx >= fActiveLimit) {
4906 // At end of input. Match failed. Backtrack out.
4907 fHitEnd = true;
4908 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4909 break;
4910 }
4911
4912 // There is input left. Advance over one char, unless we've hit end-of-line
4913 UChar32 c;
4914 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4915 if (c == 0x0a) {
4916 // End of line in normal mode. '.' does not match the \n
4917 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4918 }
4919 }
4920 break;
4921
4922
4923 case URX_JMP:
4924 fp->fPatIdx = opValue;
4925 break;
4926
4927 case URX_FAIL:
4928 isMatch = false;
4929 goto breakFromLoop;
4930
4931 case URX_JMP_SAV:
4932 U_ASSERT(opValue < fPattern->fCompiledPat->size());
4933 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4934 fp->fPatIdx = opValue; // Then JMP.
4935 break;
4936
4937 case URX_JMP_SAV_X:
4938 // This opcode is used with (x)+, when x can match a zero length string.
4939 // Same as JMP_SAV, except conditional on the match having made forward progress.
4940 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4941 // data address of the input position at the start of the loop.
4942 {
4943 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4944 int32_t stoOp = (int32_t)pat[opValue-1];
4945 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4946 int32_t frameLoc = URX_VAL(stoOp);
4947 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4948 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4949 U_ASSERT(prevInputIdx <= fp->fInputIdx);
4950 if (prevInputIdx < fp->fInputIdx) {
4951 // The match did make progress. Repeat the loop.
4952 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
4953 fp->fPatIdx = opValue;
4954 fp->fExtra[frameLoc] = fp->fInputIdx;
4955 }
4956 // If the input position did not advance, we do nothing here,
4957 // execution will fall out of the loop.
4958 }
4959 break;
4960
4961 case URX_CTR_INIT:
4962 {
4963 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4964 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
4965
4966 // Pick up the three extra operands that CTR_INIT has, and
4967 // skip the pattern location counter past
4968 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4969 fp->fPatIdx += 3;
4970 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
4971 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4972 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4973 U_ASSERT(minCount>=0);
4974 U_ASSERT(maxCount>=minCount || maxCount==-1);
4975 U_ASSERT(loopLoc>=fp->fPatIdx);
4976
4977 if (minCount == 0) {
4978 fp = StateSave(fp, loopLoc+1, status);
4979 }
4980 if (maxCount == -1) {
4981 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
4982 } else if (maxCount == 0) {
4983 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4984 }
4985 }
4986 break;
4987
4988 case URX_CTR_LOOP:
4989 {
4990 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4991 int32_t initOp = (int32_t)pat[opValue];
4992 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4993 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4994 int32_t minCount = (int32_t)pat[opValue+2];
4995 int32_t maxCount = (int32_t)pat[opValue+3];
4996 (*pCounter)++;
4997 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
4998 U_ASSERT(*pCounter == maxCount);
4999 break;
5000 }
5001 if (*pCounter >= minCount) {
5002 if (maxCount == -1) {
5003 // Loop has no hard upper bound.
5004 // Check that it is progressing through the input, break if it is not.
5005 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5006 if (fp->fInputIdx == *pLastInputIdx) {
5007 break;
5008 } else {
5009 *pLastInputIdx = fp->fInputIdx;
5010 }
5011 }
5012 fp = StateSave(fp, fp->fPatIdx, status);
5013 } else {
5014 // Increment time-out counter. (StateSave() does it if count >= minCount)
5015 fTickCounter--;
5016 if (fTickCounter <= 0) {
5017 IncrementTime(status); // Re-initializes fTickCounter
5018 }
5019 }
5020 fp->fPatIdx = opValue + 4; // Loop back.
5021 }
5022 break;
5023
5024 case URX_CTR_INIT_NG:
5025 {
5026 // Initialize a non-greedy loop
5027 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5028 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5029
5030 // Pick up the three extra operands that CTR_INIT_NG has, and
5031 // skip the pattern location counter past
5032 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5033 fp->fPatIdx += 3;
5034 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5035 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5036 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5037 U_ASSERT(minCount>=0);
5038 U_ASSERT(maxCount>=minCount || maxCount==-1);
5039 U_ASSERT(loopLoc>fp->fPatIdx);
5040 if (maxCount == -1) {
5041 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5042 }
5043
5044 if (minCount == 0) {
5045 if (maxCount != 0) {
5046 fp = StateSave(fp, fp->fPatIdx, status);
5047 }
5048 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5049 }
5050 }
5051 break;
5052
5053 case URX_CTR_LOOP_NG:
5054 {
5055 // Non-greedy {min, max} loops
5056 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5057 int32_t initOp = (int32_t)pat[opValue];
5058 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5059 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5060 int32_t minCount = (int32_t)pat[opValue+2];
5061 int32_t maxCount = (int32_t)pat[opValue+3];
5062
5063 (*pCounter)++;
5064 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5065 // The loop has matched the maximum permitted number of times.
5066 // Break out of here with no action. Matching will
5067 // continue with the following pattern.
5068 U_ASSERT(*pCounter == maxCount);
5069 break;
5070 }
5071
5072 if (*pCounter < minCount) {
5073 // We haven't met the minimum number of matches yet.
5074 // Loop back for another one.
5075 fp->fPatIdx = opValue + 4; // Loop back.
5076 fTickCounter--;
5077 if (fTickCounter <= 0) {
5078 IncrementTime(status); // Re-initializes fTickCounter
5079 }
5080 } else {
5081 // We do have the minimum number of matches.
5082
5083 // If there is no upper bound on the loop iterations, check that the input index
5084 // is progressing, and stop the loop if it is not.
5085 if (maxCount == -1) {
5086 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5087 if (fp->fInputIdx == *pLastInputIdx) {
5088 break;
5089 }
5090 *pLastInputIdx = fp->fInputIdx;
5091 }
5092
5093 // Loop Continuation: we will fall into the pattern following the loop
5094 // (non-greedy, don't execute loop body first), but first do
5095 // a state save to the top of the loop, so that a match failure
5096 // in the following pattern will try another iteration of the loop.
5097 fp = StateSave(fp, opValue + 4, status);
5098 }
5099 }
5100 break;
5101
5102 case URX_STO_SP:
5103 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5104 fData[opValue] = fStack->size();
5105 break;
5106
5107 case URX_LD_SP:
5108 {
5109 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5110 int32_t newStackSize = (int32_t)fData[opValue];
5111 U_ASSERT(newStackSize <= fStack->size());
5112 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5113 if (newFP == (int64_t *)fp) {
5114 break;
5115 }
5116 int32_t j;
5117 for (j=0; j<fFrameSize; j++) {
5118 newFP[j] = ((int64_t *)fp)[j];
5119 }
5120 fp = (REStackFrame *)newFP;
5121 fStack->setSize(newStackSize);
5122 }
5123 break;
5124
5125 case URX_BACKREF:
5126 {
5127 U_ASSERT(opValue < fFrameSize);
5128 int64_t groupStartIdx = fp->fExtra[opValue];
5129 int64_t groupEndIdx = fp->fExtra[opValue+1];
5130 U_ASSERT(groupStartIdx <= groupEndIdx);
5131 int64_t inputIndex = fp->fInputIdx;
5132 if (groupStartIdx < 0) {
5133 // This capture group has not participated in the match thus far,
5134 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5135 break;
5136 }
5137 UBool success = true;
5138 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5139 if (inputIndex >= fActiveLimit) {
5140 success = false;
5141 fHitEnd = true;
5142 break;
5143 }
5144 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5145 success = false;
5146 break;
5147 }
5148 }
5149 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5150 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5151 // Capture group ended with an unpaired lead surrogate.
5152 // Back reference is not permitted to match lead only of a surrogatge pair.
5153 success = false;
5154 }
5155 if (success) {
5156 fp->fInputIdx = inputIndex;
5157 } else {
5158 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5159 }
5160 }
5161 break;
5162
5163 case URX_BACKREF_I:
5164 {
5165 U_ASSERT(opValue < fFrameSize);
5166 int64_t groupStartIdx = fp->fExtra[opValue];
5167 int64_t groupEndIdx = fp->fExtra[opValue+1];
5168 U_ASSERT(groupStartIdx <= groupEndIdx);
5169 if (groupStartIdx < 0) {
5170 // This capture group has not participated in the match thus far,
5171 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5172 break;
5173 }
5174 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5175 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5176
5177 // Note: if the capture group match was of an empty string the backref
5178 // match succeeds. Verified by testing: Perl matches succeed
5179 // in this case, so we do too.
5180
5181 UBool success = true;
5182 for (;;) {
5183 UChar32 captureGroupChar = captureGroupItr.next();
5184 if (captureGroupChar == U_SENTINEL) {
5185 success = true;
5186 break;
5187 }
5188 UChar32 inputChar = inputItr.next();
5189 if (inputChar == U_SENTINEL) {
5190 success = false;
5191 fHitEnd = true;
5192 break;
5193 }
5194 if (inputChar != captureGroupChar) {
5195 success = false;
5196 break;
5197 }
5198 }
5199
5200 if (success && inputItr.inExpansion()) {
5201 // We obtained a match by consuming part of a string obtained from
5202 // case-folding a single code point of the input text.
5203 // This does not count as an overall match.
5204 success = false;
5205 }
5206
5207 if (success) {
5208 fp->fInputIdx = inputItr.getIndex();
5209 } else {
5210 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5211 }
5212 }
5213 break;
5214
5215 case URX_STO_INP_LOC:
5216 {
5217 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5218 fp->fExtra[opValue] = fp->fInputIdx;
5219 }
5220 break;
5221
5222 case URX_JMPX:
5223 {
5224 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5225 fp->fPatIdx += 1;
5226 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5227 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5228 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5229 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5230 if (savedInputIdx < fp->fInputIdx) {
5231 fp->fPatIdx = opValue; // JMP
5232 } else {
5233 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5234 }
5235 }
5236 break;
5237
5238 case URX_LA_START:
5239 {
5240 // Entering a look around block.
5241 // Save Stack Ptr, Input Pos.
5242 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5243 fData[opValue] = fStack->size();
5244 fData[opValue+1] = fp->fInputIdx;
5245 fData[opValue+2] = fActiveStart;
5246 fData[opValue+3] = fActiveLimit;
5247 fActiveStart = fLookStart; // Set the match region change for
5248 fActiveLimit = fLookLimit; // transparent bounds.
5249 }
5250 break;
5251
5252 case URX_LA_END:
5253 {
5254 // Leaving a look around block.
5255 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5256 U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5257 int32_t stackSize = fStack->size();
5258 int32_t newStackSize = (int32_t)fData[opValue];
5259 U_ASSERT(stackSize >= newStackSize);
5260 if (stackSize > newStackSize) {
5261 // Copy the current top frame back to the new (cut back) top frame.
5262 // This makes the capture groups from within the look-ahead
5263 // expression available.
5264 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5265 int32_t j;
5266 for (j=0; j<fFrameSize; j++) {
5267 newFP[j] = ((int64_t *)fp)[j];
5268 }
5269 fp = (REStackFrame *)newFP;
5270 fStack->setSize(newStackSize);
5271 }
5272 fp->fInputIdx = fData[opValue+1];
5273
5274 // Restore the active region bounds in the input string; they may have
5275 // been changed because of transparent bounds on a Region.
5276 fActiveStart = fData[opValue+2];
5277 fActiveLimit = fData[opValue+3];
5278 U_ASSERT(fActiveStart >= 0);
5279 U_ASSERT(fActiveLimit <= fInputLength);
5280 }
5281 break;
5282
5283 case URX_ONECHAR_I:
5284 if (fp->fInputIdx < fActiveLimit) {
5285 UChar32 c;
5286 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5287 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5288 break;
5289 }
5290 } else {
5291 fHitEnd = true;
5292 }
5293 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5294 break;
5295
5296 case URX_STRING_I:
5297 // Case-insensitive test input against a literal string.
5298 // Strings require two slots in the compiled pattern, one for the
5299 // offset to the string text, and one for the length.
5300 // The compiled string has already been case folded.
5301 {
5302 const char16_t *patternString = litText + opValue;
5303
5304 op = (int32_t)pat[fp->fPatIdx];
5305 fp->fPatIdx++;
5306 opType = URX_TYPE(op);
5307 opValue = URX_VAL(op);
5308 U_ASSERT(opType == URX_STRING_LEN);
5309 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5310
5311 UChar32 cText;
5312 UChar32 cPattern;
5313 UBool success = true;
5314 int32_t patternStringIdx = 0;
5315 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5316 while (patternStringIdx < patternStringLen) {
5317 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5318 cText = inputIterator.next();
5319 if (cText != cPattern) {
5320 success = false;
5321 if (cText == U_SENTINEL) {
5322 fHitEnd = true;
5323 }
5324 break;
5325 }
5326 }
5327 if (inputIterator.inExpansion()) {
5328 success = false;
5329 }
5330
5331 if (success) {
5332 fp->fInputIdx = inputIterator.getIndex();
5333 } else {
5334 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5335 }
5336 }
5337 break;
5338
5339 case URX_LB_START:
5340 {
5341 // Entering a look-behind block.
5342 // Save Stack Ptr, Input Pos and active input region.
5343 // TODO: implement transparent bounds. Ticket #6067
5344 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5345 fData[opValue] = fStack->size();
5346 fData[opValue+1] = fp->fInputIdx;
5347 // Save input string length, then reset to pin any matches to end at
5348 // the current position.
5349 fData[opValue+2] = fActiveStart;
5350 fData[opValue+3] = fActiveLimit;
5351 fActiveStart = fRegionStart;
5352 fActiveLimit = fp->fInputIdx;
5353 // Init the variable containing the start index for attempted matches.
5354 fData[opValue+4] = -1;
5355 }
5356 break;
5357
5358
5359 case URX_LB_CONT:
5360 {
5361 // Positive Look-Behind, at top of loop checking for matches of LB expression
5362 // at all possible input starting positions.
5363
5364 // Fetch the min and max possible match lengths. They are the operands
5365 // of this op in the pattern.
5366 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5367 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5368 U_ASSERT(minML <= maxML);
5369 U_ASSERT(minML >= 0);
5370
5371 // Fetch (from data) the last input index where a match was attempted.
5372 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5373 int64_t &lbStartIdx = fData[opValue+4];
5374 if (lbStartIdx < 0) {
5375 // First time through loop.
5376 lbStartIdx = fp->fInputIdx - minML;
5377 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5378 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5379 }
5380 } else {
5381 // 2nd through nth time through the loop.
5382 // Back up start position for match by one.
5383 if (lbStartIdx == 0) {
5384 lbStartIdx--;
5385 } else {
5386 U16_BACK_1(inputBuf, 0, lbStartIdx);
5387 }
5388 }
5389
5390 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5391 // We have tried all potential match starting points without
5392 // getting a match. Backtrack out, and out of the
5393 // Look Behind altogether.
5394 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5395 fActiveStart = fData[opValue+2];
5396 fActiveLimit = fData[opValue+3];
5397 U_ASSERT(fActiveStart >= 0);
5398 U_ASSERT(fActiveLimit <= fInputLength);
5399 break;
5400 }
5401
5402 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5403 // (successful match will fall off the end of the loop.)
5404 fp = StateSave(fp, fp->fPatIdx-3, status);
5405 fp->fInputIdx = lbStartIdx;
5406 }
5407 break;
5408
5409 case URX_LB_END:
5410 // End of a look-behind block, after a successful match.
5411 {
5412 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5413 if (fp->fInputIdx != fActiveLimit) {
5414 // The look-behind expression matched, but the match did not
5415 // extend all the way to the point that we are looking behind from.
5416 // FAIL out of here, which will take us back to the LB_CONT, which
5417 // will retry the match starting at another position or fail
5418 // the look-behind altogether, whichever is appropriate.
5419 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5420 break;
5421 }
5422
5423 // Look-behind match is good. Restore the original input string region,
5424 // which had been truncated to pin the end of the lookbehind match to the
5425 // position being looked-behind.
5426 fActiveStart = fData[opValue+2];
5427 fActiveLimit = fData[opValue+3];
5428 U_ASSERT(fActiveStart >= 0);
5429 U_ASSERT(fActiveLimit <= fInputLength);
5430 }
5431 break;
5432
5433
5434 case URX_LBN_CONT:
5435 {
5436 // Negative Look-Behind, at top of loop checking for matches of LB expression
5437 // at all possible input starting positions.
5438
5439 // Fetch the extra parameters of this op.
5440 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5441 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5442 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5443 continueLoc = URX_VAL(continueLoc);
5444 U_ASSERT(minML <= maxML);
5445 U_ASSERT(minML >= 0);
5446 U_ASSERT(continueLoc > fp->fPatIdx);
5447
5448 // Fetch (from data) the last input index where a match was attempted.
5449 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5450 int64_t &lbStartIdx = fData[opValue+4];
5451 if (lbStartIdx < 0) {
5452 // First time through loop.
5453 lbStartIdx = fp->fInputIdx - minML;
5454 if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5455 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5456 }
5457 } else {
5458 // 2nd through nth time through the loop.
5459 // Back up start position for match by one.
5460 if (lbStartIdx == 0) {
5461 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
5462 } else {
5463 U16_BACK_1(inputBuf, 0, lbStartIdx);
5464 }
5465 }
5466
5467 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5468 // We have tried all potential match starting points without
5469 // getting a match, which means that the negative lookbehind as
5470 // a whole has succeeded. Jump forward to the continue location
5471 fActiveStart = fData[opValue+2];
5472 fActiveLimit = fData[opValue+3];
5473 U_ASSERT(fActiveStart >= 0);
5474 U_ASSERT(fActiveLimit <= fInputLength);
5475 fp->fPatIdx = continueLoc;
5476 break;
5477 }
5478
5479 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5480 // (successful match will cause a FAIL out of the loop altogether.)
5481 fp = StateSave(fp, fp->fPatIdx-4, status);
5482 fp->fInputIdx = lbStartIdx;
5483 }
5484 break;
5485
5486 case URX_LBN_END:
5487 // End of a negative look-behind block, after a successful match.
5488 {
5489 U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5490 if (fp->fInputIdx != fActiveLimit) {
5491 // The look-behind expression matched, but the match did not
5492 // extend all the way to the point that we are looking behind from.
5493 // FAIL out of here, which will take us back to the LB_CONT, which
5494 // will retry the match starting at another position or succeed
5495 // the look-behind altogether, whichever is appropriate.
5496 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5497 break;
5498 }
5499
5500 // Look-behind expression matched, which means look-behind test as
5501 // a whole Fails
5502
5503 // Restore the original input string length, which had been truncated
5504 // inorder to pin the end of the lookbehind match
5505 // to the position being looked-behind.
5506 fActiveStart = fData[opValue+2];
5507 fActiveLimit = fData[opValue+3];
5508 U_ASSERT(fActiveStart >= 0);
5509 U_ASSERT(fActiveLimit <= fInputLength);
5510
5511 // Restore original stack position, discarding any state saved
5512 // by the successful pattern match.
5513 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5514 int32_t newStackSize = (int32_t)fData[opValue];
5515 U_ASSERT(fStack->size() > newStackSize);
5516 fStack->setSize(newStackSize);
5517
5518 // FAIL, which will take control back to someplace
5519 // prior to entering the look-behind test.
5520 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5521 }
5522 break;
5523
5524
5525 case URX_LOOP_SR_I:
5526 // Loop Initialization for the optimized implementation of
5527 // [some character set]*
5528 // This op scans through all matching input.
5529 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5530 {
5531 U_ASSERT(opValue > 0 && opValue < fSets->size());
5532 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5533 UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
5534
5535 // Loop through input, until either the input is exhausted or
5536 // we reach a character that is not a member of the set.
5537 int32_t ix = (int32_t)fp->fInputIdx;
5538 for (;;) {
5539 if (ix >= fActiveLimit) {
5540 fHitEnd = true;
5541 break;
5542 }
5543 UChar32 c;
5544 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5545 if (c<256) {
5546 if (s8->contains(c) == false) {
5547 U16_BACK_1(inputBuf, 0, ix);
5548 break;
5549 }
5550 } else {
5551 if (s->contains(c) == false) {
5552 U16_BACK_1(inputBuf, 0, ix);
5553 break;
5554 }
5555 }
5556 }
5557
5558 // If there were no matching characters, skip over the loop altogether.
5559 // The loop doesn't run at all, a * op always succeeds.
5560 if (ix == fp->fInputIdx) {
5561 fp->fPatIdx++; // skip the URX_LOOP_C op.
5562 break;
5563 }
5564
5565 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5566 // must follow. It's operand is the stack location
5567 // that holds the starting input index for the match of this [set]*
5568 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5569 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5570 int32_t stackLoc = URX_VAL(loopcOp);
5571 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5572 fp->fExtra[stackLoc] = fp->fInputIdx;
5573 fp->fInputIdx = ix;
5574
5575 // Save State to the URX_LOOP_C op that follows this one,
5576 // so that match failures in the following code will return to there.
5577 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5578 fp = StateSave(fp, fp->fPatIdx, status);
5579 fp->fPatIdx++;
5580 }
5581 break;
5582
5583
5584 case URX_LOOP_DOT_I:
5585 // Loop Initialization for the optimized implementation of .*
5586 // This op scans through all remaining input.
5587 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5588 {
5589 // Loop through input until the input is exhausted (we reach an end-of-line)
5590 // In DOTALL mode, we can just go straight to the end of the input.
5591 int32_t ix;
5592 if ((opValue & 1) == 1) {
5593 // Dot-matches-All mode. Jump straight to the end of the string.
5594 ix = (int32_t)fActiveLimit;
5595 fHitEnd = true;
5596 } else {
5597 // NOT DOT ALL mode. Line endings do not match '.'
5598 // Scan forward until a line ending or end of input.
5599 ix = (int32_t)fp->fInputIdx;
5600 for (;;) {
5601 if (ix >= fActiveLimit) {
5602 fHitEnd = true;
5603 break;
5604 }
5605 UChar32 c;
5606 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5607 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5608 if ((c == 0x0a) || // 0x0a is newline in both modes.
5609 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5610 isLineTerminator(c))) {
5611 // char is a line ending. Put the input pos back to the
5612 // line ending char, and exit the scanning loop.
5613 U16_BACK_1(inputBuf, 0, ix);
5614 break;
5615 }
5616 }
5617 }
5618 }
5619
5620 // If there were no matching characters, skip over the loop altogether.
5621 // The loop doesn't run at all, a * op always succeeds.
5622 if (ix == fp->fInputIdx) {
5623 fp->fPatIdx++; // skip the URX_LOOP_C op.
5624 break;
5625 }
5626
5627 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5628 // must follow. It's operand is the stack location
5629 // that holds the starting input index for the match of this .*
5630 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5631 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5632 int32_t stackLoc = URX_VAL(loopcOp);
5633 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5634 fp->fExtra[stackLoc] = fp->fInputIdx;
5635 fp->fInputIdx = ix;
5636
5637 // Save State to the URX_LOOP_C op that follows this one,
5638 // so that match failures in the following code will return to there.
5639 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5640 fp = StateSave(fp, fp->fPatIdx, status);
5641 fp->fPatIdx++;
5642 }
5643 break;
5644
5645
5646 case URX_LOOP_C:
5647 {
5648 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5649 backSearchIndex = (int32_t)fp->fExtra[opValue];
5650 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5651 if (backSearchIndex == fp->fInputIdx) {
5652 // We've backed up the input idx to the point that the loop started.
5653 // The loop is done. Leave here without saving state.
5654 // Subsequent failures won't come back here.
5655 break;
5656 }
5657 // Set up for the next iteration of the loop, with input index
5658 // backed up by one from the last time through,
5659 // and a state save to this instruction in case the following code fails again.
5660 // (We're going backwards because this loop emulates stack unwinding, not
5661 // the initial scan forward.)
5662 U_ASSERT(fp->fInputIdx > 0);
5663 UChar32 prevC;
5664 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5665
5666 if (prevC == 0x0a &&
5667 fp->fInputIdx > backSearchIndex &&
5668 inputBuf[fp->fInputIdx-1] == 0x0d) {
5669 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5670 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5671 // .*, stepping back over CRLF pair.
5672 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5673 }
5674 }
5675
5676
5677 fp = StateSave(fp, fp->fPatIdx-1, status);
5678 }
5679 break;
5680
5681
5682
5683 default:
5684 // Trouble. The compiled pattern contains an entry with an
5685 // unrecognized type tag.
5686 UPRV_UNREACHABLE_ASSERT;
5687 // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
5688 // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
5689 // See ICU-21669.
5690 status = U_INTERNAL_PROGRAM_ERROR;
5691 }
5692
5693 if (U_FAILURE(status)) {
5694 isMatch = false;
5695 break;
5696 }
5697 }
5698
5699 breakFromLoop:
5700 fMatch = isMatch;
5701 if (isMatch) {
5702 fLastMatchEnd = fMatchEnd;
5703 fMatchStart = startIdx;
5704 fMatchEnd = fp->fInputIdx;
5705 }
5706
5707 #ifdef REGEX_RUN_DEBUG
5708 if (fTraceDebug) {
5709 if (isMatch) {
5710 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5711 } else {
5712 printf("No match\n\n");
5713 }
5714 }
5715 #endif
5716
5717 fFrame = fp; // The active stack frame when the engine stopped.
5718 // Contains the capture group results that we need to
5719 // access later.
5720 }
5721
5722
5723 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5724
5725 U_NAMESPACE_END
5726
5727 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5728
5729