1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // file: regexcmp.cpp
5 //
6 // Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 // All Rights Reserved.
8 //
9 // This file contains the ICU regular expression compiler, which is responsible
10 // for processing a regular expression pattern into the compiled form that
11 // is used by the match finding engine.
12 //
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstr.h"
32 #include "cstring.h"
33 #include "uvectr32.h"
34 #include "uvectr64.h"
35 #include "uassert.h"
36 #include "uinvchar.h"
37
38 #include "regeximp.h"
39 #include "regexcst.h" // Contains state table for the regex pattern parser.
40 // generated by a Perl script.
41 #include "regexcmp.h"
42 #include "regexst.h"
43 #include "regextxt.h"
44
45
46
47 U_NAMESPACE_BEGIN
48
49
50 //------------------------------------------------------------------------------
51 //
52 // Constructor.
53 //
54 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56 fParenStack(status), fSetStack(status), fSetOpStack(status)
57 {
58 // Lazy init of all shared global sets (needed for init()'s empty text)
59 RegexStaticSets::initGlobals(&status);
60
61 fStatus = &status;
62
63 fRXPat = rxp;
64 fScanIndex = 0;
65 fLastChar = -1;
66 fPeekChar = -1;
67 fLineNum = 1;
68 fCharNum = 0;
69 fQuoteMode = FALSE;
70 fInBackslashQuote = FALSE;
71 fModeFlags = fRXPat->fFlags | 0x80000000;
72 fEOLComments = TRUE;
73
74 fMatchOpenParen = -1;
75 fMatchCloseParen = -1;
76 fCaptureName = NULL;
77 fLastSetLiteral = U_SENTINEL;
78
79 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80 status = rxp->fDeferredStatus;
81 }
82 }
83
84 static const UChar chAmp = 0x26; // '&'
85 static const UChar chDash = 0x2d; // '-'
86
87
88 //------------------------------------------------------------------------------
89 //
90 // Destructor
91 //
92 //------------------------------------------------------------------------------
~RegexCompile()93 RegexCompile::~RegexCompile() {
94 delete fCaptureName; // Normally will be NULL, but can exist if pattern
95 // compilation stops with a syntax error.
96 }
97
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100 }
101
102 //------------------------------------------------------------------------------
103 //
104 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
105 // The state tables are hand-written in the file regexcst.txt,
106 // and converted to the form used here by a perl
107 // script regexcst.pl
108 //
109 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)110 void RegexCompile::compile(
111 const UnicodeString &pat, // Source pat to be compiled.
112 UParseError &pp, // Error position info
113 UErrorCode &e) // Error Code
114 {
115 fRXPat->fPatternString = new UnicodeString(pat);
116 UText patternText = UTEXT_INITIALIZER;
117 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119 if (U_SUCCESS(e)) {
120 compile(&patternText, pp, e);
121 utext_close(&patternText);
122 }
123 }
124
125 //
126 // compile, UText mode
127 // All the work is actually done here.
128 //
compile(UText * pat,UParseError & pp,UErrorCode & e)129 void RegexCompile::compile(
130 UText *pat, // Source pat to be compiled.
131 UParseError &pp, // Error position info
132 UErrorCode &e) // Error Code
133 {
134 fStatus = &e;
135 fParseErr = &pp;
136 fStackPtr = 0;
137 fStack[fStackPtr] = 0;
138
139 if (U_FAILURE(*fStatus)) {
140 return;
141 }
142
143 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
144 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
145
146 // Prepare the RegexPattern object to receive the compiled pattern.
147 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148 if (U_FAILURE(*fStatus)) {
149 return;
150 }
151
152 // Initialize the pattern scanning state machine
153 fPatternLength = utext_nativeLength(pat);
154 uint16_t state = 1;
155 const RegexTableEl *tableEl;
156
157 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158 if (fModeFlags & UREGEX_LITERAL) {
159 fQuoteMode = TRUE;
160 }
161
162 nextChar(fC); // Fetch the first char from the pattern string.
163
164 //
165 // Main loop for the regex pattern parsing state machine.
166 // Runs once per state transition.
167 // Each time through optionally performs, depending on the state table,
168 // - an advance to the the next pattern char
169 // - an action to be performed.
170 // - pushing or popping a state to/from the local state return stack.
171 // file regexcst.txt is the source for the state table. The logic behind
172 // recongizing the pattern syntax is there, not here.
173 //
174 for (;;) {
175 // Bail out if anything has gone wrong.
176 // Regex pattern parsing stops on the first error encountered.
177 if (U_FAILURE(*fStatus)) {
178 break;
179 }
180
181 U_ASSERT(state != 0);
182
183 // Find the state table element that matches the input char from the pattern, or the
184 // class of the input character. Start with the first table row for this
185 // state, then linearly scan forward until we find a row that matches the
186 // character. The last row for each state always matches all characters, so
187 // the search will stop there, if not before.
188 //
189 tableEl = &gRuleParseStateTable[state];
190 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
191 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193 for (;;) { // loop through table rows belonging to this state, looking for one
194 // that matches the current input char.
195 REGEX_SCAN_DEBUG_PRINTF(("."));
196 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
197 // Table row specified an individual character, not a set, and
198 // the input character is not quoted, and
199 // the input character matched it.
200 break;
201 }
202 if (tableEl->fCharClass == 255) {
203 // Table row specified default, match anything character class.
204 break;
205 }
206 if (tableEl->fCharClass == 254 && fC.fQuoted) {
207 // Table row specified "quoted" and the char was quoted.
208 break;
209 }
210 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
211 // Table row specified eof and we hit eof on the input.
212 break;
213 }
214
215 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
216 fC.fQuoted == FALSE && // char is not escaped &&
217 fC.fChar != (UChar32)-1) { // char is not EOF
218 U_ASSERT(tableEl->fCharClass <= 137);
219 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220 // Table row specified a character class, or set of characters,
221 // and the current char matches it.
222 break;
223 }
224 }
225
226 // No match on this row, advance to the next row for this state,
227 tableEl++;
228 }
229 REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231 //
232 // We've found the row of the state table that matches the current input
233 // character from the rules string.
234 // Perform any action specified by this row in the state table.
235 if (doParseActions(tableEl->fAction) == FALSE) {
236 // Break out of the state machine loop if the
237 // the action signalled some kind of error, or
238 // the action was to exit, occurs on normal end-of-rules-input.
239 break;
240 }
241
242 if (tableEl->fPushState != 0) {
243 fStackPtr++;
244 if (fStackPtr >= kStackSize) {
245 error(U_REGEX_INTERNAL_ERROR);
246 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247 fStackPtr--;
248 }
249 fStack[fStackPtr] = tableEl->fPushState;
250 }
251
252 //
253 // NextChar. This is where characters are actually fetched from the pattern.
254 // Happens under control of the 'n' tag in the state table.
255 //
256 if (tableEl->fNextChar) {
257 nextChar(fC);
258 }
259
260 // Get the next state from the table entry, or from the
261 // state stack if the next state was specified as "pop".
262 if (tableEl->fNextState != 255) {
263 state = tableEl->fNextState;
264 } else {
265 state = fStack[fStackPtr];
266 fStackPtr--;
267 if (fStackPtr < 0) {
268 // state stack underflow
269 // This will occur if the user pattern has mis-matched parentheses,
270 // with extra close parens.
271 //
272 fStackPtr++;
273 error(U_REGEX_MISMATCHED_PAREN);
274 }
275 }
276
277 }
278
279 if (U_FAILURE(*fStatus)) {
280 // Bail out if the pattern had errors.
281 // Set stack cleanup: a successful compile would have left it empty,
282 // but errors can leave temporary sets hanging around.
283 while (!fSetStack.empty()) {
284 delete (UnicodeSet *)fSetStack.pop();
285 }
286 return;
287 }
288
289 //
290 // The pattern has now been read and processed, and the compiled code generated.
291 //
292
293 //
294 // The pattern's fFrameSize so far has accumulated the requirements for
295 // storage for capture parentheses, counters, etc. that are encountered
296 // in the pattern. Add space for the two variables that are always
297 // present in the saved state: the input string position (int64_t) and
298 // the position in the compiled pattern.
299 //
300 allocateStackData(RESTACKFRAME_HDRCOUNT);
301
302 //
303 // Optimization pass 1: NOPs, back-references, and case-folding
304 //
305 stripNOPs();
306
307 //
308 // Get bounds for the minimum and maximum length of a string that this
309 // pattern can match. Used to avoid looking for matches in strings that
310 // are too short.
311 //
312 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
313
314 //
315 // Optimization pass 2: match start type
316 //
317 matchStartType();
318
319 //
320 // Set up fast latin-1 range sets
321 //
322 int32_t numSets = fRXPat->fSets->size();
323 fRXPat->fSets8 = new Regex8BitSet[numSets];
324 // Null pointer check.
325 if (fRXPat->fSets8 == NULL) {
326 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
327 return;
328 }
329 int32_t i;
330 for (i=0; i<numSets; i++) {
331 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
332 fRXPat->fSets8[i].init(s);
333 }
334
335 }
336
337
338
339
340
341 //------------------------------------------------------------------------------
342 //
343 // doParseAction Do some action during regex pattern parsing.
344 // Called by the parse state machine.
345 //
346 // Generation of the match engine PCode happens here, or
347 // in functions called from the parse actions defined here.
348 //
349 //
350 //------------------------------------------------------------------------------
doParseActions(int32_t action)351 UBool RegexCompile::doParseActions(int32_t action)
352 {
353 UBool returnVal = TRUE;
354
355 switch ((Regex_PatternParseAction)action) {
356
357 case doPatStart:
358 // Start of pattern compiles to:
359 //0 SAVE 2 Fall back to position of FAIL
360 //1 jmp 3
361 //2 FAIL Stop if we ever reach here.
362 //3 NOP Dummy, so start of pattern looks the same as
363 // the start of an ( grouping.
364 //4 NOP Resreved, will be replaced by a save if there are
365 // OR | operators at the top level
366 appendOp(URX_STATE_SAVE, 2);
367 appendOp(URX_JMP, 3);
368 appendOp(URX_FAIL, 0);
369
370 // Standard open nonCapture paren action emits the two NOPs and
371 // sets up the paren stack frame.
372 doParseActions(doOpenNonCaptureParen);
373 break;
374
375 case doPatFinish:
376 // We've scanned to the end of the pattern
377 // The end of pattern compiles to:
378 // URX_END
379 // which will stop the runtime match engine.
380 // Encountering end of pattern also behaves like a close paren,
381 // and forces fixups of the State Save at the beginning of the compiled pattern
382 // and of any OR operations at the top level.
383 //
384 handleCloseParen();
385 if (fParenStack.size() > 0) {
386 // Missing close paren in pattern.
387 error(U_REGEX_MISMATCHED_PAREN);
388 }
389
390 // add the END operation to the compiled pattern.
391 appendOp(URX_END, 0);
392
393 // Terminate the pattern compilation state machine.
394 returnVal = FALSE;
395 break;
396
397
398
399 case doOrOperator:
400 // Scanning a '|', as in (A|B)
401 {
402 // Generate code for any pending literals preceding the '|'
403 fixLiterals(FALSE);
404
405 // Insert a SAVE operation at the start of the pattern section preceding
406 // this OR at this level. This SAVE will branch the match forward
407 // to the right hand side of the OR in the event that the left hand
408 // side fails to match and backtracks. Locate the position for the
409 // save from the location on the top of the parentheses stack.
410 int32_t savePosition = fParenStack.popi();
411 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
412 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
413 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
414 fRXPat->fCompiledPat->setElementAt(op, savePosition);
415
416 // Append an JMP operation into the compiled pattern. The operand for
417 // the JMP will eventually be the location following the ')' for the
418 // group. This will be patched in later, when the ')' is encountered.
419 appendOp(URX_JMP, 0);
420
421 // Push the position of the newly added JMP op onto the parentheses stack.
422 // This registers if for fixup when this block's close paren is encountered.
423 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
424
425 // Append a NOP to the compiled pattern. This is the slot reserved
426 // for a SAVE in the event that there is yet another '|' following
427 // this one.
428 appendOp(URX_NOP, 0);
429 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
430 }
431 break;
432
433
434 case doBeginNamedCapture:
435 // Scanning (?<letter.
436 // The first letter of the name will come through again under doConinueNamedCapture.
437 fCaptureName = new UnicodeString();
438 if (fCaptureName == NULL) {
439 error(U_MEMORY_ALLOCATION_ERROR);
440 }
441 break;
442
443 case doContinueNamedCapture:
444 fCaptureName->append(fC.fChar);
445 break;
446
447 case doBadNamedCapture:
448 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
449 break;
450
451 case doOpenCaptureParen:
452 // Open Capturing Paren, possibly named.
453 // Compile to a
454 // - NOP, which later may be replaced by a save-state if the
455 // parenthesized group gets a * quantifier, followed by
456 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
457 // - NOP, which may later be replaced by a save-state if there
458 // is an '|' alternation within the parens.
459 //
460 // Each capture group gets three slots in the save stack frame:
461 // 0: Capture Group start position (in input string being matched.)
462 // 1: Capture Group end position.
463 // 2: Start of Match-in-progress.
464 // The first two locations are for a completed capture group, and are
465 // referred to by back references and the like.
466 // The third location stores the capture start position when an START_CAPTURE is
467 // encountered. This will be promoted to a completed capture when (and if) the corresponding
468 // END_CAPTURE is encountered.
469 {
470 fixLiterals();
471 appendOp(URX_NOP, 0);
472 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
473 appendOp(URX_START_CAPTURE, varsLoc);
474 appendOp(URX_NOP, 0);
475
476 // On the Parentheses stack, start a new frame and add the postions
477 // of the two NOPs. Depending on what follows in the pattern, the
478 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
479 // address of the end of the parenthesized group.
480 fParenStack.push(fModeFlags, *fStatus); // Match mode state
481 fParenStack.push(capturing, *fStatus); // Frame type.
482 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
483 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
484
485 // Save the mapping from group number to stack frame variable position.
486 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
487
488 // If this is a named capture group, add the name->group number mapping.
489 if (fCaptureName != NULL) {
490 if (!fRXPat->initNamedCaptureMap()) {
491 if (U_SUCCESS(*fStatus)) {
492 error(fRXPat->fDeferredStatus);
493 }
494 break;
495 }
496 int32_t groupNumber = fRXPat->fGroupMap->size();
497 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
498 fCaptureName = NULL; // hash table takes ownership of the name (key) string.
499 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
500 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
501 }
502 }
503 }
504 break;
505
506 case doOpenNonCaptureParen:
507 // Open non-caputuring (grouping only) Paren.
508 // Compile to a
509 // - NOP, which later may be replaced by a save-state if the
510 // parenthesized group gets a * quantifier, followed by
511 // - NOP, which may later be replaced by a save-state if there
512 // is an '|' alternation within the parens.
513 {
514 fixLiterals();
515 appendOp(URX_NOP, 0);
516 appendOp(URX_NOP, 0);
517
518 // On the Parentheses stack, start a new frame and add the postions
519 // of the two NOPs.
520 fParenStack.push(fModeFlags, *fStatus); // Match mode state
521 fParenStack.push(plain, *fStatus); // Begin a new frame.
522 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
523 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
524 }
525 break;
526
527
528 case doOpenAtomicParen:
529 // Open Atomic Paren. (?>
530 // Compile to a
531 // - NOP, which later may be replaced if the parenthesized group
532 // has a quantifier, followed by
533 // - STO_SP save state stack position, so it can be restored at the ")"
534 // - NOP, which may later be replaced by a save-state if there
535 // is an '|' alternation within the parens.
536 {
537 fixLiterals();
538 appendOp(URX_NOP, 0);
539 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
540 appendOp(URX_STO_SP, varLoc);
541 appendOp(URX_NOP, 0);
542
543 // On the Parentheses stack, start a new frame and add the postions
544 // of the two NOPs. Depending on what follows in the pattern, the
545 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
546 // address of the end of the parenthesized group.
547 fParenStack.push(fModeFlags, *fStatus); // Match mode state
548 fParenStack.push(atomic, *fStatus); // Frame type.
549 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
550 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
551 }
552 break;
553
554
555 case doOpenLookAhead:
556 // Positive Look-ahead (?= stuff )
557 //
558 // Note: Addition of transparent input regions, with the need to
559 // restore the original regions when failing out of a lookahead
560 // block, complicated this sequence. Some conbined opcodes
561 // might make sense - or might not, lookahead aren't that common.
562 //
563 // Caution: min match length optimization knows about this
564 // sequence; don't change without making updates there too.
565 //
566 // Compiles to
567 // 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
568 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
569 // 3 JMP 6 continue ...
570 //
571 // 4. LA_END Look Ahead failed. Restore regions.
572 // 5. BACKTRACK and back track again.
573 //
574 // 6. NOP reserved for use by quantifiers on the block.
575 // Look-ahead can't have quantifiers, but paren stack
576 // compile time conventions require the slot anyhow.
577 // 7. NOP may be replaced if there is are '|' ops in the block.
578 // 8. code for parenthesized stuff.
579 // 9. LA_END
580 //
581 // Four data slots are reserved, for saving state on entry to the look-around
582 // 0: stack pointer on entry.
583 // 1: input position on entry.
584 // 2: fActiveStart, the active bounds start on entry.
585 // 3: fActiveLimit, the active bounds limit on entry.
586 {
587 fixLiterals();
588 int32_t dataLoc = allocateData(4);
589 appendOp(URX_LA_START, dataLoc);
590 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
591 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
592 appendOp(URX_LA_END, dataLoc);
593 appendOp(URX_BACKTRACK, 0);
594 appendOp(URX_NOP, 0);
595 appendOp(URX_NOP, 0);
596
597 // On the Parentheses stack, start a new frame and add the postions
598 // of the NOPs.
599 fParenStack.push(fModeFlags, *fStatus); // Match mode state
600 fParenStack.push(lookAhead, *fStatus); // Frame type.
601 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
602 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
603 }
604 break;
605
606 case doOpenLookAheadNeg:
607 // Negated Lookahead. (?! stuff )
608 // Compiles to
609 // 1. LA_START dataloc
610 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
611 // // which continues with the match.
612 // 3. NOP // Std. Open Paren sequence, for possible '|'
613 // 4. code for parenthesized stuff.
614 // 5. LA_END // Cut back stack, remove saved state from step 2.
615 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
616 // 7. END_LA // Restore match region, in case look-ahead was using
617 // an alternate (transparent) region.
618 // Four data slots are reserved, for saving state on entry to the look-around
619 // 0: stack pointer on entry.
620 // 1: input position on entry.
621 // 2: fActiveStart, the active bounds start on entry.
622 // 3: fActiveLimit, the active bounds limit on entry.
623 {
624 fixLiterals();
625 int32_t dataLoc = allocateData(4);
626 appendOp(URX_LA_START, dataLoc);
627 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later.
628 appendOp(URX_NOP, 0);
629
630 // On the Parentheses stack, start a new frame and add the postions
631 // of the StateSave and NOP.
632 fParenStack.push(fModeFlags, *fStatus); // Match mode state
633 fParenStack.push(negLookAhead, *fStatus); // Frame type
634 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
635 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
636
637 // Instructions #5 - #7 will be added when the ')' is encountered.
638 }
639 break;
640
641 case doOpenLookBehind:
642 {
643 // Compile a (?<= look-behind open paren.
644 //
645 // Compiles to
646 // 0 URX_LB_START dataLoc
647 // 1 URX_LB_CONT dataLoc
648 // 2 MinMatchLen
649 // 3 MaxMatchLen
650 // 4 URX_NOP Standard '(' boilerplate.
651 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
652 // 6 <code for LookBehind expression>
653 // 7 URX_LB_END dataLoc # Check match len, restore input len
654 // 8 URX_LA_END dataLoc # Restore stack, input pos
655 //
656 // Allocate a block of matcher data, to contain (when running a match)
657 // 0: Stack ptr on entry
658 // 1: Input Index on entry
659 // 2: fActiveStart, the active bounds start on entry.
660 // 3: fActiveLimit, the active bounds limit on entry.
661 // 4: Start index of match current match attempt.
662 // The first four items must match the layout of data for LA_START / LA_END
663
664 // Generate match code for any pending literals.
665 fixLiterals();
666
667 // Allocate data space
668 int32_t dataLoc = allocateData(5);
669
670 // Emit URX_LB_START
671 appendOp(URX_LB_START, dataLoc);
672
673 // Emit URX_LB_CONT
674 appendOp(URX_LB_CONT, dataLoc);
675 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
676 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
677
678 // Emit the NOPs
679 appendOp(URX_NOP, 0);
680 appendOp(URX_NOP, 0);
681
682 // On the Parentheses stack, start a new frame and add the postions
683 // of the URX_LB_CONT and the NOP.
684 fParenStack.push(fModeFlags, *fStatus); // Match mode state
685 fParenStack.push(lookBehind, *fStatus); // Frame type
686 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
687 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
688
689 // The final two instructions will be added when the ')' is encountered.
690 }
691
692 break;
693
694 case doOpenLookBehindNeg:
695 {
696 // Compile a (?<! negated look-behind open paren.
697 //
698 // Compiles to
699 // 0 URX_LB_START dataLoc # Save entry stack, input len
700 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
701 // 2 MinMatchLen
702 // 3 MaxMatchLen
703 // 4 continueLoc (9)
704 // 5 URX_NOP Standard '(' boilerplate.
705 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
706 // 7 <code for LookBehind expression>
707 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
708 // 9 ...
709 //
710 // Allocate a block of matcher data, to contain (when running a match)
711 // 0: Stack ptr on entry
712 // 1: Input Index on entry
713 // 2: fActiveStart, the active bounds start on entry.
714 // 3: fActiveLimit, the active bounds limit on entry.
715 // 4: Start index of match current match attempt.
716 // The first four items must match the layout of data for LA_START / LA_END
717
718 // Generate match code for any pending literals.
719 fixLiterals();
720
721 // Allocate data space
722 int32_t dataLoc = allocateData(5);
723
724 // Emit URX_LB_START
725 appendOp(URX_LB_START, dataLoc);
726
727 // Emit URX_LBN_CONT
728 appendOp(URX_LBN_CONT, dataLoc);
729 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
730 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
731 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
732
733 // Emit the NOPs
734 appendOp(URX_NOP, 0);
735 appendOp(URX_NOP, 0);
736
737 // On the Parentheses stack, start a new frame and add the postions
738 // of the URX_LB_CONT and the NOP.
739 fParenStack.push(fModeFlags, *fStatus); // Match mode state
740 fParenStack.push(lookBehindN, *fStatus); // Frame type
741 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
742 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
743
744 // The final two instructions will be added when the ')' is encountered.
745 }
746 break;
747
748 case doConditionalExpr:
749 // Conditionals such as (?(1)a:b)
750 case doPerlInline:
751 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
752 error(U_REGEX_UNIMPLEMENTED);
753 break;
754
755
756 case doCloseParen:
757 handleCloseParen();
758 if (fParenStack.size() <= 0) {
759 // Extra close paren, or missing open paren.
760 error(U_REGEX_MISMATCHED_PAREN);
761 }
762 break;
763
764 case doNOP:
765 break;
766
767
768 case doBadOpenParenType:
769 case doRuleError:
770 error(U_REGEX_RULE_SYNTAX);
771 break;
772
773
774 case doMismatchedParenErr:
775 error(U_REGEX_MISMATCHED_PAREN);
776 break;
777
778 case doPlus:
779 // Normal '+' compiles to
780 // 1. stuff to be repeated (already built)
781 // 2. jmp-sav 1
782 // 3. ...
783 //
784 // Or, if the item to be repeated can match a zero length string,
785 // 1. STO_INP_LOC data-loc
786 // 2. body of stuff to be repeated
787 // 3. JMP_SAV_X 2
788 // 4. ...
789
790 //
791 // Or, if the item to be repeated is simple
792 // 1. Item to be repeated.
793 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
794 // 3. LOOP_C stack location
795 {
796 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
797 int32_t frameLoc;
798
799 // Check for simple constructs, which may get special optimized code.
800 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
801 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
802
803 if (URX_TYPE(repeatedOp) == URX_SETREF) {
804 // Emit optimized code for [char set]+
805 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
806 frameLoc = allocateStackData(1);
807 appendOp(URX_LOOP_C, frameLoc);
808 break;
809 }
810
811 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
812 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
813 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
814 // Emit Optimized code for .+ operations.
815 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
816 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
817 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
818 loopOpI |= 1;
819 }
820 if (fModeFlags & UREGEX_UNIX_LINES) {
821 loopOpI |= 2;
822 }
823 appendOp(loopOpI);
824 frameLoc = allocateStackData(1);
825 appendOp(URX_LOOP_C, frameLoc);
826 break;
827 }
828
829 }
830
831 // General case.
832
833 // Check for minimum match length of zero, which requires
834 // extra loop-breaking code.
835 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
836 // Zero length match is possible.
837 // Emit the code sequence that can handle it.
838 insertOp(topLoc);
839 frameLoc = allocateStackData(1);
840
841 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
842 fRXPat->fCompiledPat->setElementAt(op, topLoc);
843
844 appendOp(URX_JMP_SAV_X, topLoc+1);
845 } else {
846 // Simpler code when the repeated body must match something non-empty
847 appendOp(URX_JMP_SAV, topLoc);
848 }
849 }
850 break;
851
852 case doNGPlus:
853 // Non-greedy '+?' compiles to
854 // 1. stuff to be repeated (already built)
855 // 2. state-save 1
856 // 3. ...
857 {
858 int32_t topLoc = blockTopLoc(FALSE);
859 appendOp(URX_STATE_SAVE, topLoc);
860 }
861 break;
862
863
864 case doOpt:
865 // Normal (greedy) ? quantifier.
866 // Compiles to
867 // 1. state save 3
868 // 2. body of optional block
869 // 3. ...
870 // Insert the state save into the compiled pattern, and we're done.
871 {
872 int32_t saveStateLoc = blockTopLoc(TRUE);
873 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
874 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
875 }
876 break;
877
878 case doNGOpt:
879 // Non-greedy ?? quantifier
880 // compiles to
881 // 1. jmp 4
882 // 2. body of optional block
883 // 3 jmp 5
884 // 4. state save 2
885 // 5 ...
886 // This code is less than ideal, with two jmps instead of one, because we can only
887 // insert one instruction at the top of the block being iterated.
888 {
889 int32_t jmp1_loc = blockTopLoc(TRUE);
890 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
891
892 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
893 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
894
895 appendOp(URX_JMP, jmp2_loc+2);
896
897 appendOp(URX_STATE_SAVE, jmp1_loc+1);
898 }
899 break;
900
901
902 case doStar:
903 // Normal (greedy) * quantifier.
904 // Compiles to
905 // 1. STATE_SAVE 4
906 // 2. body of stuff being iterated over
907 // 3. JMP_SAV 2
908 // 4. ...
909 //
910 // Or, if the body is a simple [Set],
911 // 1. LOOP_SR_I set number
912 // 2. LOOP_C stack location
913 // ...
914 //
915 // Or if this is a .*
916 // 1. LOOP_DOT_I (. matches all mode flag)
917 // 2. LOOP_C stack location
918 //
919 // Or, if the body can match a zero-length string, to inhibit infinite loops,
920 // 1. STATE_SAVE 5
921 // 2. STO_INP_LOC data-loc
922 // 3. body of stuff
923 // 4. JMP_SAV_X 2
924 // 5. ...
925 {
926 // location of item #1, the STATE_SAVE
927 int32_t topLoc = blockTopLoc(FALSE);
928 int32_t dataLoc = -1;
929
930 // Check for simple *, where the construct being repeated
931 // compiled to single opcode, and might be optimizable.
932 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
933 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
934
935 if (URX_TYPE(repeatedOp) == URX_SETREF) {
936 // Emit optimized code for a [char set]*
937 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
938 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
939 dataLoc = allocateStackData(1);
940 appendOp(URX_LOOP_C, dataLoc);
941 break;
942 }
943
944 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
945 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
946 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
947 // Emit Optimized code for .* operations.
948 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
949 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
950 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
951 loopOpI |= 1;
952 }
953 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
954 loopOpI |= 2;
955 }
956 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
957 dataLoc = allocateStackData(1);
958 appendOp(URX_LOOP_C, dataLoc);
959 break;
960 }
961 }
962
963 // Emit general case code for this *
964 // The optimizations did not apply.
965
966 int32_t saveStateLoc = blockTopLoc(TRUE);
967 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
968
969 // Check for minimum match length of zero, which requires
970 // extra loop-breaking code.
971 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
972 insertOp(saveStateLoc);
973 dataLoc = allocateStackData(1);
974
975 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
976 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
977 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
978 }
979
980 // Locate the position in the compiled pattern where the match will continue
981 // after completing the *. (4 or 5 in the comment above)
982 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
983
984 // Put together the save state op and store it into the compiled code.
985 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
986 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
987
988 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
989 appendOp(jmpOp);
990 }
991 break;
992
993 case doNGStar:
994 // Non-greedy *? quantifier
995 // compiles to
996 // 1. JMP 3
997 // 2. body of stuff being iterated over
998 // 3. STATE_SAVE 2
999 // 4 ...
1000 {
1001 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
1002 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
1003 int32_t jmpOp = buildOp(URX_JMP, saveLoc);
1004 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1005 appendOp(URX_STATE_SAVE, jmpLoc+1);
1006 }
1007 break;
1008
1009
1010 case doIntervalInit:
1011 // The '{' opening an interval quantifier was just scanned.
1012 // Init the counter varaiables that will accumulate the values as the digits
1013 // are scanned.
1014 fIntervalLow = 0;
1015 fIntervalUpper = -1;
1016 break;
1017
1018 case doIntevalLowerDigit:
1019 // Scanned a digit from the lower value of an {lower,upper} interval
1020 {
1021 int32_t digitValue = u_charDigitValue(fC.fChar);
1022 U_ASSERT(digitValue >= 0);
1023 int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1024 if (val > INT32_MAX) {
1025 error(U_REGEX_NUMBER_TOO_BIG);
1026 } else {
1027 fIntervalLow = (int32_t)val;
1028 }
1029 }
1030 break;
1031
1032 case doIntervalUpperDigit:
1033 // Scanned a digit from the upper value of an {lower,upper} interval
1034 {
1035 if (fIntervalUpper < 0) {
1036 fIntervalUpper = 0;
1037 }
1038 int32_t digitValue = u_charDigitValue(fC.fChar);
1039 U_ASSERT(digitValue >= 0);
1040 int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1041 if (val > INT32_MAX) {
1042 error(U_REGEX_NUMBER_TOO_BIG);
1043 } else {
1044 fIntervalUpper = (int32_t)val;
1045 }
1046 }
1047 break;
1048
1049 case doIntervalSame:
1050 // Scanned a single value interval like {27}. Upper = Lower.
1051 fIntervalUpper = fIntervalLow;
1052 break;
1053
1054 case doInterval:
1055 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1056 if (compileInlineInterval() == FALSE) {
1057 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1058 }
1059 break;
1060
1061 case doPossessiveInterval:
1062 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1063 {
1064 // Remember the loc for the top of the block being looped over.
1065 // (Can not reserve a slot in the compiled pattern at this time, because
1066 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1067 // once per block.)
1068 int32_t topLoc = blockTopLoc(FALSE);
1069
1070 // Produce normal looping code.
1071 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1072
1073 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1074 // just as if the loop was inclosed in atomic parentheses.
1075
1076 // First the STO_SP before the start of the loop
1077 insertOp(topLoc);
1078
1079 int32_t varLoc = allocateData(1); // Reserve a data location for saving the
1080 int32_t op = buildOp(URX_STO_SP, varLoc);
1081 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1082
1083 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1084 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1085 loopOp++; // point LoopOp after the just-inserted STO_SP
1086 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1087
1088 // Then the LD_SP after the end of the loop
1089 appendOp(URX_LD_SP, varLoc);
1090 }
1091
1092 break;
1093
1094 case doNGInterval:
1095 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1096 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1097 break;
1098
1099 case doIntervalError:
1100 error(U_REGEX_BAD_INTERVAL);
1101 break;
1102
1103 case doLiteralChar:
1104 // We've just scanned a "normal" character from the pattern,
1105 literalChar(fC.fChar);
1106 break;
1107
1108
1109 case doEscapedLiteralChar:
1110 // We've just scanned an backslashed escaped character with no
1111 // special meaning. It represents itself.
1112 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1113 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1114 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1115 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1116 }
1117 literalChar(fC.fChar);
1118 break;
1119
1120
1121 case doDotAny:
1122 // scanned a ".", match any single character.
1123 {
1124 fixLiterals(FALSE);
1125 if (fModeFlags & UREGEX_DOTALL) {
1126 appendOp(URX_DOTANY_ALL, 0);
1127 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1128 appendOp(URX_DOTANY_UNIX, 0);
1129 } else {
1130 appendOp(URX_DOTANY, 0);
1131 }
1132 }
1133 break;
1134
1135 case doCaret:
1136 {
1137 fixLiterals(FALSE);
1138 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1139 appendOp(URX_CARET, 0);
1140 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1141 appendOp(URX_CARET_M, 0);
1142 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1143 appendOp(URX_CARET, 0); // Only testing true start of input.
1144 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1145 appendOp(URX_CARET_M_UNIX, 0);
1146 }
1147 }
1148 break;
1149
1150 case doDollar:
1151 {
1152 fixLiterals(FALSE);
1153 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1154 appendOp(URX_DOLLAR, 0);
1155 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1156 appendOp(URX_DOLLAR_M, 0);
1157 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1158 appendOp(URX_DOLLAR_D, 0);
1159 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1160 appendOp(URX_DOLLAR_MD, 0);
1161 }
1162 }
1163 break;
1164
1165 case doBackslashA:
1166 fixLiterals(FALSE);
1167 appendOp(URX_CARET, 0);
1168 break;
1169
1170 case doBackslashB:
1171 {
1172 #if UCONFIG_NO_BREAK_ITERATION==1
1173 if (fModeFlags & UREGEX_UWORD) {
1174 error(U_UNSUPPORTED_ERROR);
1175 }
1176 #endif
1177 fixLiterals(FALSE);
1178 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1179 appendOp(op, 1);
1180 }
1181 break;
1182
1183 case doBackslashb:
1184 {
1185 #if UCONFIG_NO_BREAK_ITERATION==1
1186 if (fModeFlags & UREGEX_UWORD) {
1187 error(U_UNSUPPORTED_ERROR);
1188 }
1189 #endif
1190 fixLiterals(FALSE);
1191 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1192 appendOp(op, 0);
1193 }
1194 break;
1195
1196 case doBackslashD:
1197 fixLiterals(FALSE);
1198 appendOp(URX_BACKSLASH_D, 1);
1199 break;
1200
1201 case doBackslashd:
1202 fixLiterals(FALSE);
1203 appendOp(URX_BACKSLASH_D, 0);
1204 break;
1205
1206 case doBackslashG:
1207 fixLiterals(FALSE);
1208 appendOp(URX_BACKSLASH_G, 0);
1209 break;
1210
1211 case doBackslashH:
1212 fixLiterals(FALSE);
1213 appendOp(URX_BACKSLASH_H, 1);
1214 break;
1215
1216 case doBackslashh:
1217 fixLiterals(FALSE);
1218 appendOp(URX_BACKSLASH_H, 0);
1219 break;
1220
1221 case doBackslashR:
1222 fixLiterals(FALSE);
1223 appendOp(URX_BACKSLASH_R, 0);
1224 break;
1225
1226 case doBackslashS:
1227 fixLiterals(FALSE);
1228 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1229 break;
1230
1231 case doBackslashs:
1232 fixLiterals(FALSE);
1233 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1234 break;
1235
1236 case doBackslashV:
1237 fixLiterals(FALSE);
1238 appendOp(URX_BACKSLASH_V, 1);
1239 break;
1240
1241 case doBackslashv:
1242 fixLiterals(FALSE);
1243 appendOp(URX_BACKSLASH_V, 0);
1244 break;
1245
1246 case doBackslashW:
1247 fixLiterals(FALSE);
1248 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1249 break;
1250
1251 case doBackslashw:
1252 fixLiterals(FALSE);
1253 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1254 break;
1255
1256 case doBackslashX:
1257 #if UCONFIG_NO_BREAK_ITERATION==1
1258 // Grapheme Cluster Boundary requires ICU break iteration.
1259 error(U_UNSUPPORTED_ERROR);
1260 #endif
1261 fixLiterals(FALSE);
1262 appendOp(URX_BACKSLASH_X, 0);
1263 break;
1264
1265 case doBackslashZ:
1266 fixLiterals(FALSE);
1267 appendOp(URX_DOLLAR, 0);
1268 break;
1269
1270 case doBackslashz:
1271 fixLiterals(FALSE);
1272 appendOp(URX_BACKSLASH_Z, 0);
1273 break;
1274
1275 case doEscapeError:
1276 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1277 break;
1278
1279 case doExit:
1280 fixLiterals(FALSE);
1281 returnVal = FALSE;
1282 break;
1283
1284 case doProperty:
1285 {
1286 fixLiterals(FALSE);
1287 UnicodeSet *theSet = scanProp();
1288 compileSet(theSet);
1289 }
1290 break;
1291
1292 case doNamedChar:
1293 {
1294 UChar32 c = scanNamedChar();
1295 literalChar(c);
1296 }
1297 break;
1298
1299
1300 case doBackRef:
1301 // BackReference. Somewhat unusual in that the front-end can not completely parse
1302 // the regular expression, because the number of digits to be consumed
1303 // depends on the number of capture groups that have been defined. So
1304 // we have to do it here instead.
1305 {
1306 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1307 int32_t groupNum = 0;
1308 UChar32 c = fC.fChar;
1309
1310 for (;;) {
1311 // Loop once per digit, for max allowed number of digits in a back reference.
1312 int32_t digit = u_charDigitValue(c);
1313 groupNum = groupNum * 10 + digit;
1314 if (groupNum >= numCaptureGroups) {
1315 break;
1316 }
1317 c = peekCharLL();
1318 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1319 break;
1320 }
1321 nextCharLL();
1322 }
1323
1324 // Scan of the back reference in the source regexp is complete. Now generate
1325 // the compiled code for it.
1326 // Because capture groups can be forward-referenced by back-references,
1327 // we fill the operand with the capture group number. At the end
1328 // of compilation, it will be changed to the variable's location.
1329 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1330 // and shouldn't enter this code path at all.
1331 fixLiterals(FALSE);
1332 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1333 appendOp(URX_BACKREF_I, groupNum);
1334 } else {
1335 appendOp(URX_BACKREF, groupNum);
1336 }
1337 }
1338 break;
1339
1340 case doBeginNamedBackRef:
1341 U_ASSERT(fCaptureName == NULL);
1342 fCaptureName = new UnicodeString;
1343 if (fCaptureName == NULL) {
1344 error(U_MEMORY_ALLOCATION_ERROR);
1345 }
1346 break;
1347
1348 case doContinueNamedBackRef:
1349 fCaptureName->append(fC.fChar);
1350 break;
1351
1352 case doCompleteNamedBackRef:
1353 {
1354 int32_t groupNumber =
1355 fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1356 if (groupNumber == 0) {
1357 // Group name has not been defined.
1358 // Could be a forward reference. If we choose to support them at some
1359 // future time, extra mechanism will be required at this point.
1360 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1361 } else {
1362 // Given the number, handle identically to a \n numbered back reference.
1363 // See comments above, under doBackRef
1364 fixLiterals(FALSE);
1365 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1366 appendOp(URX_BACKREF_I, groupNumber);
1367 } else {
1368 appendOp(URX_BACKREF, groupNumber);
1369 }
1370 }
1371 delete fCaptureName;
1372 fCaptureName = NULL;
1373 break;
1374 }
1375
1376 case doPossessivePlus:
1377 // Possessive ++ quantifier.
1378 // Compiles to
1379 // 1. STO_SP
1380 // 2. body of stuff being iterated over
1381 // 3. STATE_SAVE 5
1382 // 4. JMP 2
1383 // 5. LD_SP
1384 // 6. ...
1385 //
1386 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1387 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1388 //
1389 {
1390 // Emit the STO_SP
1391 int32_t topLoc = blockTopLoc(TRUE);
1392 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1393 int32_t op = buildOp(URX_STO_SP, stoLoc);
1394 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1395
1396 // Emit the STATE_SAVE
1397 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1398
1399 // Emit the JMP
1400 appendOp(URX_JMP, topLoc+1);
1401
1402 // Emit the LD_SP
1403 appendOp(URX_LD_SP, stoLoc);
1404 }
1405 break;
1406
1407 case doPossessiveStar:
1408 // Possessive *+ quantifier.
1409 // Compiles to
1410 // 1. STO_SP loc
1411 // 2. STATE_SAVE 5
1412 // 3. body of stuff being iterated over
1413 // 4. JMP 2
1414 // 5. LD_SP loc
1415 // 6 ...
1416 // TODO: do something to cut back the state stack each time through the loop.
1417 {
1418 // Reserve two slots at the top of the block.
1419 int32_t topLoc = blockTopLoc(TRUE);
1420 insertOp(topLoc);
1421
1422 // emit STO_SP loc
1423 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1424 int32_t op = buildOp(URX_STO_SP, stoLoc);
1425 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1426
1427 // Emit the SAVE_STATE 5
1428 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1429 op = buildOp(URX_STATE_SAVE, L7);
1430 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1431
1432 // Append the JMP operation.
1433 appendOp(URX_JMP, topLoc+1);
1434
1435 // Emit the LD_SP loc
1436 appendOp(URX_LD_SP, stoLoc);
1437 }
1438 break;
1439
1440 case doPossessiveOpt:
1441 // Possessive ?+ quantifier.
1442 // Compiles to
1443 // 1. STO_SP loc
1444 // 2. SAVE_STATE 5
1445 // 3. body of optional block
1446 // 4. LD_SP loc
1447 // 5. ...
1448 //
1449 {
1450 // Reserve two slots at the top of the block.
1451 int32_t topLoc = blockTopLoc(TRUE);
1452 insertOp(topLoc);
1453
1454 // Emit the STO_SP
1455 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1456 int32_t op = buildOp(URX_STO_SP, stoLoc);
1457 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1458
1459 // Emit the SAVE_STATE
1460 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1461 op = buildOp(URX_STATE_SAVE, continueLoc);
1462 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1463
1464 // Emit the LD_SP
1465 appendOp(URX_LD_SP, stoLoc);
1466 }
1467 break;
1468
1469
1470 case doBeginMatchMode:
1471 fNewModeFlags = fModeFlags;
1472 fSetModeFlag = TRUE;
1473 break;
1474
1475 case doMatchMode: // (?i) and similar
1476 {
1477 int32_t bit = 0;
1478 switch (fC.fChar) {
1479 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1480 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
1481 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1482 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1483 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
1484 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1485 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1486 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1487 default:
1488 UPRV_UNREACHABLE; // Should never happen. Other chars are filtered out
1489 // by the scanner.
1490 }
1491 if (fSetModeFlag) {
1492 fNewModeFlags |= bit;
1493 } else {
1494 fNewModeFlags &= ~bit;
1495 }
1496 }
1497 break;
1498
1499 case doSetMatchMode:
1500 // Emit code to match any pending literals, using the not-yet changed match mode.
1501 fixLiterals();
1502
1503 // We've got a (?i) or similar. The match mode is being changed, but
1504 // the change is not scoped to a parenthesized block.
1505 U_ASSERT(fNewModeFlags < 0);
1506 fModeFlags = fNewModeFlags;
1507
1508 break;
1509
1510
1511 case doMatchModeParen:
1512 // We've got a (?i: or similar. Begin a parenthesized block, save old
1513 // mode flags so they can be restored at the close of the block.
1514 //
1515 // Compile to a
1516 // - NOP, which later may be replaced by a save-state if the
1517 // parenthesized group gets a * quantifier, followed by
1518 // - NOP, which may later be replaced by a save-state if there
1519 // is an '|' alternation within the parens.
1520 {
1521 fixLiterals(FALSE);
1522 appendOp(URX_NOP, 0);
1523 appendOp(URX_NOP, 0);
1524
1525 // On the Parentheses stack, start a new frame and add the postions
1526 // of the two NOPs (a normal non-capturing () frame, except for the
1527 // saving of the orignal mode flags.)
1528 fParenStack.push(fModeFlags, *fStatus);
1529 fParenStack.push(flags, *fStatus); // Frame Marker
1530 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1531 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1532
1533 // Set the current mode flags to the new values.
1534 U_ASSERT(fNewModeFlags < 0);
1535 fModeFlags = fNewModeFlags;
1536 }
1537 break;
1538
1539 case doBadModeFlag:
1540 error(U_REGEX_INVALID_FLAG);
1541 break;
1542
1543 case doSuppressComments:
1544 // We have just scanned a '(?'. We now need to prevent the character scanner from
1545 // treating a '#' as a to-the-end-of-line comment.
1546 // (This Perl compatibility just gets uglier and uglier to do...)
1547 fEOLComments = FALSE;
1548 break;
1549
1550
1551 case doSetAddAmp:
1552 {
1553 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554 set->add(chAmp);
1555 }
1556 break;
1557
1558 case doSetAddDash:
1559 {
1560 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1561 set->add(chDash);
1562 }
1563 break;
1564
1565 case doSetBackslash_s:
1566 {
1567 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1568 set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1569 break;
1570 }
1571
1572 case doSetBackslash_S:
1573 {
1574 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1575 UnicodeSet SSet;
1576 SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1577 set->addAll(SSet);
1578 break;
1579 }
1580
1581 case doSetBackslash_d:
1582 {
1583 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1584 // TODO - make a static set, ticket 6058.
1585 addCategory(set, U_GC_ND_MASK, *fStatus);
1586 break;
1587 }
1588
1589 case doSetBackslash_D:
1590 {
1591 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1592 UnicodeSet digits;
1593 // TODO - make a static set, ticket 6058.
1594 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1595 digits.complement();
1596 set->addAll(digits);
1597 break;
1598 }
1599
1600 case doSetBackslash_h:
1601 {
1602 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1603 UnicodeSet h;
1604 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1605 h.add((UChar32)9); // Tab
1606 set->addAll(h);
1607 break;
1608 }
1609
1610 case doSetBackslash_H:
1611 {
1612 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1613 UnicodeSet h;
1614 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1615 h.add((UChar32)9); // Tab
1616 h.complement();
1617 set->addAll(h);
1618 break;
1619 }
1620
1621 case doSetBackslash_v:
1622 {
1623 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1624 set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1625 set->add((UChar32)0x85);
1626 set->add((UChar32)0x2028, (UChar32)0x2029);
1627 break;
1628 }
1629
1630 case doSetBackslash_V:
1631 {
1632 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1633 UnicodeSet v;
1634 v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1635 v.add((UChar32)0x85);
1636 v.add((UChar32)0x2028, (UChar32)0x2029);
1637 v.complement();
1638 set->addAll(v);
1639 break;
1640 }
1641
1642 case doSetBackslash_w:
1643 {
1644 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1645 set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1646 break;
1647 }
1648
1649 case doSetBackslash_W:
1650 {
1651 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1652 UnicodeSet SSet;
1653 SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1654 set->addAll(SSet);
1655 break;
1656 }
1657
1658 case doSetBegin:
1659 fixLiterals(FALSE);
1660 fSetStack.push(new UnicodeSet(), *fStatus);
1661 fSetOpStack.push(setStart, *fStatus);
1662 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1663 fSetOpStack.push(setCaseClose, *fStatus);
1664 }
1665 break;
1666
1667 case doSetBeginDifference1:
1668 // We have scanned something like [[abc]-[
1669 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1670 // Push a Difference operator, which will cause the new set to be subtracted from what
1671 // went before once it is created.
1672 setPushOp(setDifference1);
1673 fSetOpStack.push(setStart, *fStatus);
1674 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1675 fSetOpStack.push(setCaseClose, *fStatus);
1676 }
1677 break;
1678
1679 case doSetBeginIntersection1:
1680 // We have scanned something like [[abc]&[
1681 // Need both the '&' operator and the open '[' operator.
1682 setPushOp(setIntersection1);
1683 fSetOpStack.push(setStart, *fStatus);
1684 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1685 fSetOpStack.push(setCaseClose, *fStatus);
1686 }
1687 break;
1688
1689 case doSetBeginUnion:
1690 // We have scanned something like [[abc][
1691 // Need to handle the union operation explicitly [[abc] | [
1692 setPushOp(setUnion);
1693 fSetOpStack.push(setStart, *fStatus);
1694 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1695 fSetOpStack.push(setCaseClose, *fStatus);
1696 }
1697 break;
1698
1699 case doSetDifference2:
1700 // We have scanned something like [abc--
1701 // Consider this to unambiguously be a set difference operator.
1702 setPushOp(setDifference2);
1703 break;
1704
1705 case doSetEnd:
1706 // Have encountered the ']' that closes a set.
1707 // Force the evaluation of any pending operations within this set,
1708 // leave the completed set on the top of the set stack.
1709 setEval(setEnd);
1710 U_ASSERT(fSetOpStack.peeki()==setStart);
1711 fSetOpStack.popi();
1712 break;
1713
1714 case doSetFinish:
1715 {
1716 // Finished a complete set expression, including all nested sets.
1717 // The close bracket has already triggered clearing out pending set operators,
1718 // the operator stack should be empty and the operand stack should have just
1719 // one entry, the result set.
1720 U_ASSERT(fSetOpStack.empty());
1721 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1722 U_ASSERT(fSetStack.empty());
1723 compileSet(theSet);
1724 break;
1725 }
1726
1727 case doSetIntersection2:
1728 // Have scanned something like [abc&&
1729 setPushOp(setIntersection2);
1730 break;
1731
1732 case doSetLiteral:
1733 // Union the just-scanned literal character into the set being built.
1734 // This operation is the highest precedence set operation, so we can always do
1735 // it immediately, without waiting to see what follows. It is necessary to perform
1736 // any pending '-' or '&' operation first, because these have the same precedence
1737 // as union-ing in a literal'
1738 {
1739 setEval(setUnion);
1740 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1741 s->add(fC.fChar);
1742 fLastSetLiteral = fC.fChar;
1743 break;
1744 }
1745
1746 case doSetLiteralEscaped:
1747 // A back-slash escaped literal character was encountered.
1748 // Processing is the same as with setLiteral, above, with the addition of
1749 // the optional check for errors on escaped ASCII letters.
1750 {
1751 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1752 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1753 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1754 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1755 }
1756 setEval(setUnion);
1757 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1758 s->add(fC.fChar);
1759 fLastSetLiteral = fC.fChar;
1760 break;
1761 }
1762
1763 case doSetNamedChar:
1764 // Scanning a \N{UNICODE CHARACTER NAME}
1765 // Aside from the source of the character, the processing is identical to doSetLiteral,
1766 // above.
1767 {
1768 UChar32 c = scanNamedChar();
1769 setEval(setUnion);
1770 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1771 s->add(c);
1772 fLastSetLiteral = c;
1773 break;
1774 }
1775
1776 case doSetNamedRange:
1777 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1778 // The left character is already in the set, and is saved in fLastSetLiteral.
1779 // The right side needs to be picked up, the scan is at the 'N'.
1780 // Lower Limit > Upper limit being an error matches both Java
1781 // and ICU UnicodeSet behavior.
1782 {
1783 UChar32 c = scanNamedChar();
1784 if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1785 error(U_REGEX_INVALID_RANGE);
1786 }
1787 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1788 s->add(fLastSetLiteral, c);
1789 fLastSetLiteral = c;
1790 break;
1791 }
1792
1793
1794 case doSetNegate:
1795 // Scanned a '^' at the start of a set.
1796 // Push the negation operator onto the set op stack.
1797 // A twist for case-insensitive matching:
1798 // the case closure operation must happen _before_ negation.
1799 // But the case closure operation will already be on the stack if it's required.
1800 // This requires checking for case closure, and swapping the stack order
1801 // if it is present.
1802 {
1803 int32_t tosOp = fSetOpStack.peeki();
1804 if (tosOp == setCaseClose) {
1805 fSetOpStack.popi();
1806 fSetOpStack.push(setNegation, *fStatus);
1807 fSetOpStack.push(setCaseClose, *fStatus);
1808 } else {
1809 fSetOpStack.push(setNegation, *fStatus);
1810 }
1811 }
1812 break;
1813
1814 case doSetNoCloseError:
1815 error(U_REGEX_MISSING_CLOSE_BRACKET);
1816 break;
1817
1818 case doSetOpError:
1819 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1820 break;
1821
1822 case doSetPosixProp:
1823 {
1824 UnicodeSet *s = scanPosixProp();
1825 if (s != NULL) {
1826 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1827 tos->addAll(*s);
1828 delete s;
1829 } // else error. scanProp() reported the error status already.
1830 }
1831 break;
1832
1833 case doSetProp:
1834 // Scanned a \p \P within [brackets].
1835 {
1836 UnicodeSet *s = scanProp();
1837 if (s != NULL) {
1838 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1839 tos->addAll(*s);
1840 delete s;
1841 } // else error. scanProp() reported the error status already.
1842 }
1843 break;
1844
1845
1846 case doSetRange:
1847 // We have scanned literal-literal. Add the range to the set.
1848 // The left character is already in the set, and is saved in fLastSetLiteral.
1849 // The right side is the current character.
1850 // Lower Limit > Upper limit being an error matches both Java
1851 // and ICU UnicodeSet behavior.
1852 {
1853
1854 if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1855 error(U_REGEX_INVALID_RANGE);
1856 }
1857 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1858 s->add(fLastSetLiteral, fC.fChar);
1859 break;
1860 }
1861
1862 default:
1863 UPRV_UNREACHABLE;
1864 }
1865
1866 if (U_FAILURE(*fStatus)) {
1867 returnVal = FALSE;
1868 }
1869
1870 return returnVal;
1871 }
1872
1873
1874
1875 //------------------------------------------------------------------------------
1876 //
1877 // literalChar We've encountered a literal character from the pattern,
1878 // or an escape sequence that reduces to a character.
1879 // Add it to the string containing all literal chars/strings from
1880 // the pattern.
1881 //
1882 //------------------------------------------------------------------------------
literalChar(UChar32 c)1883 void RegexCompile::literalChar(UChar32 c) {
1884 fLiteralChars.append(c);
1885 }
1886
1887
1888 //------------------------------------------------------------------------------
1889 //
1890 // fixLiterals When compiling something that can follow a literal
1891 // string in a pattern, emit the code to match the
1892 // accumulated literal string.
1893 //
1894 // Optionally, split the last char of the string off into
1895 // a single "ONE_CHAR" operation, so that quantifiers can
1896 // apply to that char alone. Example: abc*
1897 // The * must apply to the 'c' only.
1898 //
1899 //------------------------------------------------------------------------------
fixLiterals(UBool split)1900 void RegexCompile::fixLiterals(UBool split) {
1901
1902 // If no literal characters have been scanned but not yet had code generated
1903 // for them, nothing needs to be done.
1904 if (fLiteralChars.length() == 0) {
1905 return;
1906 }
1907
1908 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1909 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1910
1911 // Split: We need to ensure that the last item in the compiled pattern
1912 // refers only to the last literal scanned in the pattern, so that
1913 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1914 // Split before case folding for case insensitive matches.
1915
1916 if (split) {
1917 fLiteralChars.truncate(indexOfLastCodePoint);
1918 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1919 // Note that the truncated literal string may be empty, in which case
1920 // nothing will be emitted.
1921
1922 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1923 fixLiterals(FALSE); // Second recursive call, code for the final code point.
1924 return;
1925 }
1926
1927 // If we are doing case-insensitive matching, case fold the string. This may expand
1928 // the string, e.g. the German sharp-s turns into "ss"
1929 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1930 fLiteralChars.foldCase();
1931 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1932 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1933 }
1934
1935 if (indexOfLastCodePoint == 0) {
1936 // Single character, emit a URX_ONECHAR op to match it.
1937 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1938 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1939 appendOp(URX_ONECHAR_I, lastCodePoint);
1940 } else {
1941 appendOp(URX_ONECHAR, lastCodePoint);
1942 }
1943 } else {
1944 // Two or more chars, emit a URX_STRING to match them.
1945 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1946 error(U_REGEX_PATTERN_TOO_BIG);
1947 }
1948 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1949 appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1950 } else {
1951 // TODO here: add optimization to split case sensitive strings of length two
1952 // into two single char ops, for efficiency.
1953 appendOp(URX_STRING, fRXPat->fLiteralText.length());
1954 }
1955 appendOp(URX_STRING_LEN, fLiteralChars.length());
1956
1957 // Add this string into the accumulated strings of the compiled pattern.
1958 fRXPat->fLiteralText.append(fLiteralChars);
1959 }
1960
1961 fLiteralChars.remove();
1962 }
1963
1964
buildOp(int32_t type,int32_t val)1965 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1966 if (U_FAILURE(*fStatus)) {
1967 return 0;
1968 }
1969 if (type < 0 || type > 255) {
1970 UPRV_UNREACHABLE;
1971 }
1972 if (val > 0x00ffffff) {
1973 UPRV_UNREACHABLE;
1974 }
1975 if (val < 0) {
1976 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1977 UPRV_UNREACHABLE;
1978 }
1979 if (URX_TYPE(val) != 0xff) {
1980 UPRV_UNREACHABLE;
1981 }
1982 type = URX_RESERVED_OP_N;
1983 }
1984 return (type << 24) | val;
1985 }
1986
1987
1988 //------------------------------------------------------------------------------
1989 //
1990 // appendOp() Append a new instruction onto the compiled pattern
1991 // Includes error checking, limiting the size of the
1992 // pattern to lengths that can be represented in the
1993 // 24 bit operand field of an instruction.
1994 //
1995 //------------------------------------------------------------------------------
appendOp(int32_t op)1996 void RegexCompile::appendOp(int32_t op) {
1997 if (U_FAILURE(*fStatus)) {
1998 return;
1999 }
2000 fRXPat->fCompiledPat->addElement(op, *fStatus);
2001 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2002 error(U_REGEX_PATTERN_TOO_BIG);
2003 }
2004 }
2005
appendOp(int32_t type,int32_t val)2006 void RegexCompile::appendOp(int32_t type, int32_t val) {
2007 appendOp(buildOp(type, val));
2008 }
2009
2010
2011 //------------------------------------------------------------------------------
2012 //
2013 // insertOp() Insert a slot for a new opcode into the already
2014 // compiled pattern code.
2015 //
2016 // Fill the slot with a NOP. Our caller will replace it
2017 // with what they really wanted.
2018 //
2019 //------------------------------------------------------------------------------
insertOp(int32_t where)2020 void RegexCompile::insertOp(int32_t where) {
2021 UVector64 *code = fRXPat->fCompiledPat;
2022 U_ASSERT(where>0 && where < code->size());
2023
2024 int32_t nop = buildOp(URX_NOP, 0);
2025 code->insertElementAt(nop, where, *fStatus);
2026
2027 // Walk through the pattern, looking for any ops with targets that
2028 // were moved down by the insert. Fix them.
2029 int32_t loc;
2030 for (loc=0; loc<code->size(); loc++) {
2031 int32_t op = (int32_t)code->elementAti(loc);
2032 int32_t opType = URX_TYPE(op);
2033 int32_t opValue = URX_VAL(op);
2034 if ((opType == URX_JMP ||
2035 opType == URX_JMPX ||
2036 opType == URX_STATE_SAVE ||
2037 opType == URX_CTR_LOOP ||
2038 opType == URX_CTR_LOOP_NG ||
2039 opType == URX_JMP_SAV ||
2040 opType == URX_JMP_SAV_X ||
2041 opType == URX_RELOC_OPRND) && opValue > where) {
2042 // Target location for this opcode is after the insertion point and
2043 // needs to be incremented to adjust for the insertion.
2044 opValue++;
2045 op = buildOp(opType, opValue);
2046 code->setElementAt(op, loc);
2047 }
2048 }
2049
2050 // Now fix up the parentheses stack. All positive values in it are locations in
2051 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2052 for (loc=0; loc<fParenStack.size(); loc++) {
2053 int32_t x = fParenStack.elementAti(loc);
2054 U_ASSERT(x < code->size());
2055 if (x>where) {
2056 x++;
2057 fParenStack.setElementAt(x, loc);
2058 }
2059 }
2060
2061 if (fMatchCloseParen > where) {
2062 fMatchCloseParen++;
2063 }
2064 if (fMatchOpenParen > where) {
2065 fMatchOpenParen++;
2066 }
2067 }
2068
2069
2070 //------------------------------------------------------------------------------
2071 //
2072 // allocateData() Allocate storage in the matcher's static data area.
2073 // Return the index for the newly allocated data.
2074 // The storage won't actually exist until we are running a match
2075 // operation, but the storage indexes are inserted into various
2076 // opcodes while compiling the pattern.
2077 //
2078 //------------------------------------------------------------------------------
allocateData(int32_t size)2079 int32_t RegexCompile::allocateData(int32_t size) {
2080 if (U_FAILURE(*fStatus)) {
2081 return 0;
2082 }
2083 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2084 error(U_REGEX_INTERNAL_ERROR);
2085 return 0;
2086 }
2087 int32_t dataIndex = fRXPat->fDataSize;
2088 fRXPat->fDataSize += size;
2089 if (fRXPat->fDataSize >= 0x00fffff0) {
2090 error(U_REGEX_INTERNAL_ERROR);
2091 }
2092 return dataIndex;
2093 }
2094
2095
2096 //------------------------------------------------------------------------------
2097 //
2098 // allocateStackData() Allocate space in the back-tracking stack frame.
2099 // Return the index for the newly allocated data.
2100 // The frame indexes are inserted into various
2101 // opcodes while compiling the pattern, meaning that frame
2102 // size must be restricted to the size that will fit
2103 // as an operand (24 bits).
2104 //
2105 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2106 int32_t RegexCompile::allocateStackData(int32_t size) {
2107 if (U_FAILURE(*fStatus)) {
2108 return 0;
2109 }
2110 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2111 error(U_REGEX_INTERNAL_ERROR);
2112 return 0;
2113 }
2114 int32_t dataIndex = fRXPat->fFrameSize;
2115 fRXPat->fFrameSize += size;
2116 if (fRXPat->fFrameSize >= 0x00fffff0) {
2117 error(U_REGEX_PATTERN_TOO_BIG);
2118 }
2119 return dataIndex;
2120 }
2121
2122
2123 //------------------------------------------------------------------------------
2124 //
2125 // blockTopLoc() Find or create a location in the compiled pattern
2126 // at the start of the operation or block that has
2127 // just been compiled. Needed when a quantifier (* or
2128 // whatever) appears, and we need to add an operation
2129 // at the start of the thing being quantified.
2130 //
2131 // (Parenthesized Blocks) have a slot with a NOP that
2132 // is reserved for this purpose. .* or similar don't
2133 // and a slot needs to be added.
2134 //
2135 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2136 // at the returned location.
2137 // FALSE - just return the address,
2138 // do not reserve a location there.
2139 //
2140 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2141 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2142 int32_t theLoc;
2143 fixLiterals(TRUE); // Emit code for any pending literals.
2144 // If last item was a string, emit separate op for the its last char.
2145 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2146 {
2147 // The item just processed is a parenthesized block.
2148 theLoc = fMatchOpenParen; // A slot is already reserved for us.
2149 U_ASSERT(theLoc > 0);
2150 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2151 }
2152 else {
2153 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2154 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2155 // We need to make space now.
2156 theLoc = fRXPat->fCompiledPat->size()-1;
2157 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2158 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2159 // Strings take two opcode, we want the position of the first one.
2160 // We can have a string at this point if a single character case-folded to two.
2161 theLoc--;
2162 }
2163 if (reserveLoc) {
2164 int32_t nop = buildOp(URX_NOP, 0);
2165 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2166 }
2167 }
2168 return theLoc;
2169 }
2170
2171
2172
2173 //------------------------------------------------------------------------------
2174 //
2175 // handleCloseParen When compiling a close paren, we need to go back
2176 // and fix up any JMP or SAVE operations within the
2177 // parenthesized block that need to target the end
2178 // of the block. The locations of these are kept on
2179 // the paretheses stack.
2180 //
2181 // This function is called both when encountering a
2182 // real ) and at the end of the pattern.
2183 //
2184 //------------------------------------------------------------------------------
handleCloseParen()2185 void RegexCompile::handleCloseParen() {
2186 int32_t patIdx;
2187 int32_t patOp;
2188 if (fParenStack.size() <= 0) {
2189 error(U_REGEX_MISMATCHED_PAREN);
2190 return;
2191 }
2192
2193 // Emit code for any pending literals.
2194 fixLiterals(FALSE);
2195
2196 // Fixup any operations within the just-closed parenthesized group
2197 // that need to reference the end of the (block).
2198 // (The first one popped from the stack is an unused slot for
2199 // alternation (OR) state save, but applying the fixup to it does no harm.)
2200 for (;;) {
2201 patIdx = fParenStack.popi();
2202 if (patIdx < 0) {
2203 // value < 0 flags the start of the frame on the paren stack.
2204 break;
2205 }
2206 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2207 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2208 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
2209 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
2210 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2211 fMatchOpenParen = patIdx;
2212 }
2213
2214 // At the close of any parenthesized block, restore the match mode flags to
2215 // the value they had at the open paren. Saved value is
2216 // at the top of the paren stack.
2217 fModeFlags = fParenStack.popi();
2218 U_ASSERT(fModeFlags < 0);
2219
2220 // DO any additional fixups, depending on the specific kind of
2221 // parentesized grouping this is
2222
2223 switch (patIdx) {
2224 case plain:
2225 case flags:
2226 // No additional fixups required.
2227 // (Grouping-only parentheses)
2228 break;
2229 case capturing:
2230 // Capturing Parentheses.
2231 // Insert a End Capture op into the pattern.
2232 // The frame offset of the variables for this cg is obtained from the
2233 // start capture op and put it into the end-capture op.
2234 {
2235 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2236 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2237
2238 int32_t frameVarLocation = URX_VAL(captureOp);
2239 appendOp(URX_END_CAPTURE, frameVarLocation);
2240 }
2241 break;
2242 case atomic:
2243 // Atomic Parenthesis.
2244 // Insert a LD_SP operation to restore the state stack to the position
2245 // it was when the atomic parens were entered.
2246 {
2247 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2248 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2249 int32_t stoLoc = URX_VAL(stoOp);
2250 appendOp(URX_LD_SP, stoLoc);
2251 }
2252 break;
2253
2254 case lookAhead:
2255 {
2256 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2257 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2258 int32_t dataLoc = URX_VAL(startOp);
2259 appendOp(URX_LA_END, dataLoc);
2260 }
2261 break;
2262
2263 case negLookAhead:
2264 {
2265 // See comment at doOpenLookAheadNeg
2266 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2267 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2268 int32_t dataLoc = URX_VAL(startOp);
2269 appendOp(URX_LA_END, dataLoc);
2270 appendOp(URX_BACKTRACK, 0);
2271 appendOp(URX_LA_END, dataLoc);
2272
2273 // Patch the URX_SAVE near the top of the block.
2274 // The destination of the SAVE is the final LA_END that was just added.
2275 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2276 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2277 int32_t dest = fRXPat->fCompiledPat->size()-1;
2278 saveOp = buildOp(URX_STATE_SAVE, dest);
2279 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2280 }
2281 break;
2282
2283 case lookBehind:
2284 {
2285 // See comment at doOpenLookBehind.
2286
2287 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2288 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2289 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2290 int32_t dataLoc = URX_VAL(startOp);
2291 appendOp(URX_LB_END, dataLoc);
2292 appendOp(URX_LA_END, dataLoc);
2293
2294 // Determine the min and max bounds for the length of the
2295 // string that the pattern can match.
2296 // An unbounded upper limit is an error.
2297 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2298 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2299 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2300 if (URX_TYPE(maxML) != 0) {
2301 error(U_REGEX_LOOK_BEHIND_LIMIT);
2302 break;
2303 }
2304 if (maxML == INT32_MAX) {
2305 error(U_REGEX_LOOK_BEHIND_LIMIT);
2306 break;
2307 }
2308 if (minML == INT32_MAX) {
2309 // This condition happens when no match is possible, such as with a
2310 // [set] expression containing no elements.
2311 // In principle, the generated code to evaluate the expression could be deleted,
2312 // but it's probably not worth the complication.
2313 minML = 0;
2314 }
2315 U_ASSERT(minML <= maxML);
2316
2317 // Insert the min and max match len bounds into the URX_LB_CONT op that
2318 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2319 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2320 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2321
2322 }
2323 break;
2324
2325
2326
2327 case lookBehindN:
2328 {
2329 // See comment at doOpenLookBehindNeg.
2330
2331 // Append the URX_LBN_END to the compiled pattern.
2332 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2333 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2334 int32_t dataLoc = URX_VAL(startOp);
2335 appendOp(URX_LBN_END, dataLoc);
2336
2337 // Determine the min and max bounds for the length of the
2338 // string that the pattern can match.
2339 // An unbounded upper limit is an error.
2340 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2341 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2342 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2343 if (URX_TYPE(maxML) != 0) {
2344 error(U_REGEX_LOOK_BEHIND_LIMIT);
2345 break;
2346 }
2347 if (maxML == INT32_MAX) {
2348 error(U_REGEX_LOOK_BEHIND_LIMIT);
2349 break;
2350 }
2351 if (minML == INT32_MAX) {
2352 // This condition happens when no match is possible, such as with a
2353 // [set] expression containing no elements.
2354 // In principle, the generated code to evaluate the expression could be deleted,
2355 // but it's probably not worth the complication.
2356 minML = 0;
2357 }
2358
2359 U_ASSERT(minML <= maxML);
2360
2361 // Insert the min and max match len bounds into the URX_LB_CONT op that
2362 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2363 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2364 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2365
2366 // Insert the pattern location to continue at after a successful match
2367 // as the last operand of the URX_LBN_CONT
2368 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2369 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2370 }
2371 break;
2372
2373
2374
2375 default:
2376 UPRV_UNREACHABLE;
2377 }
2378
2379 // remember the next location in the compiled pattern.
2380 // The compilation of Quantifiers will look at this to see whether its looping
2381 // over a parenthesized block or a single item
2382 fMatchCloseParen = fRXPat->fCompiledPat->size();
2383 }
2384
2385
2386
2387 //------------------------------------------------------------------------------
2388 //
2389 // compileSet Compile the pattern operations for a reference to a
2390 // UnicodeSet.
2391 //
2392 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2393 void RegexCompile::compileSet(UnicodeSet *theSet)
2394 {
2395 if (theSet == NULL) {
2396 return;
2397 }
2398 // Remove any strings from the set.
2399 // There shoudn't be any, but just in case.
2400 // (Case Closure can add them; if we had a simple case closure avaialble that
2401 // ignored strings, that would be better.)
2402 theSet->removeAllStrings();
2403 int32_t setSize = theSet->size();
2404
2405 switch (setSize) {
2406 case 0:
2407 {
2408 // Set of no elements. Always fails to match.
2409 appendOp(URX_BACKTRACK, 0);
2410 delete theSet;
2411 }
2412 break;
2413
2414 case 1:
2415 {
2416 // The set contains only a single code point. Put it into
2417 // the compiled pattern as a single char operation rather
2418 // than a set, and discard the set itself.
2419 literalChar(theSet->charAt(0));
2420 delete theSet;
2421 }
2422 break;
2423
2424 default:
2425 {
2426 // The set contains two or more chars. (the normal case)
2427 // Put it into the compiled pattern as a set.
2428 theSet->freeze();
2429 int32_t setNumber = fRXPat->fSets->size();
2430 fRXPat->fSets->addElement(theSet, *fStatus);
2431 appendOp(URX_SETREF, setNumber);
2432 }
2433 }
2434 }
2435
2436
2437 //------------------------------------------------------------------------------
2438 //
2439 // compileInterval Generate the code for a {min, max} style interval quantifier.
2440 // Except for the specific opcodes used, the code is the same
2441 // for all three types (greedy, non-greedy, possessive) of
2442 // intervals. The opcodes are supplied as parameters.
2443 // (There are two sets of opcodes - greedy & possessive use the
2444 // same ones, while non-greedy has it's own.)
2445 //
2446 // The code for interval loops has this form:
2447 // 0 CTR_INIT counter loc (in stack frame)
2448 // 1 5 patt address of CTR_LOOP at bottom of block
2449 // 2 min count
2450 // 3 max count (-1 for unbounded)
2451 // 4 ... block to be iterated over
2452 // 5 CTR_LOOP
2453 //
2454 // In
2455 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2456 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2457 {
2458 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2459 // four slots in the compiled code. Reserve them.
2460 int32_t topOfBlock = blockTopLoc(TRUE);
2461 insertOp(topOfBlock);
2462 insertOp(topOfBlock);
2463 insertOp(topOfBlock);
2464
2465 // The operands for the CTR_INIT opcode include the index in the matcher data
2466 // of the counter. Allocate it now. There are two data items
2467 // counterLoc --> Loop counter
2468 // +1 --> Input index (for breaking non-progressing loops)
2469 // (Only present if unbounded upper limit on loop)
2470 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
2471 int32_t counterLoc = allocateStackData(dataSize);
2472
2473 int32_t op = buildOp(InitOp, counterLoc);
2474 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2475
2476 // The second operand of CTR_INIT is the location following the end of the loop.
2477 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2478 // compilation of something later on causes the code to grow and the target
2479 // position to move.
2480 int32_t loopEnd = fRXPat->fCompiledPat->size();
2481 op = buildOp(URX_RELOC_OPRND, loopEnd);
2482 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2483
2484 // Followed by the min and max counts.
2485 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2486 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2487
2488 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2489 // Goes at end of the block being looped over, so just append to the code so far.
2490 appendOp(LoopOp, topOfBlock);
2491
2492 if ((fIntervalLow & 0xff000000) != 0 ||
2493 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2494 error(U_REGEX_NUMBER_TOO_BIG);
2495 }
2496
2497 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2498 error(U_REGEX_MAX_LT_MIN);
2499 }
2500 }
2501
2502
2503
compileInlineInterval()2504 UBool RegexCompile::compileInlineInterval() {
2505 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2506 // Too big to inline. Fail, which will cause looping code to be generated.
2507 // (Upper < Lower picks up unbounded upper and errors, both.)
2508 return FALSE;
2509 }
2510
2511 int32_t topOfBlock = blockTopLoc(FALSE);
2512 if (fIntervalUpper == 0) {
2513 // Pathological case. Attempt no matches, as if the block doesn't exist.
2514 // Discard the generated code for the block.
2515 // If the block included parens, discard the info pertaining to them as well.
2516 fRXPat->fCompiledPat->setSize(topOfBlock);
2517 if (fMatchOpenParen >= topOfBlock) {
2518 fMatchOpenParen = -1;
2519 }
2520 if (fMatchCloseParen >= topOfBlock) {
2521 fMatchCloseParen = -1;
2522 }
2523 return TRUE;
2524 }
2525
2526 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2527 // The thing being repeated is not a single op, but some
2528 // more complex block. Do it as a loop, not inlines.
2529 // Note that things "repeated" a max of once are handled as inline, because
2530 // the one copy of the code already generated is just fine.
2531 return FALSE;
2532 }
2533
2534 // Pick up the opcode that is to be repeated
2535 //
2536 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2537
2538 // Compute the pattern location where the inline sequence
2539 // will end, and set up the state save op that will be needed.
2540 //
2541 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2542 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2543 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2544 if (fIntervalLow == 0) {
2545 insertOp(topOfBlock);
2546 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2547 }
2548
2549
2550
2551 // Loop, emitting the op for the thing being repeated each time.
2552 // Loop starts at 1 because one instance of the op already exists in the pattern,
2553 // it was put there when it was originally encountered.
2554 int32_t i;
2555 for (i=1; i<fIntervalUpper; i++ ) {
2556 if (i >= fIntervalLow) {
2557 appendOp(saveOp);
2558 }
2559 appendOp(op);
2560 }
2561 return TRUE;
2562 }
2563
2564
2565
2566 //------------------------------------------------------------------------------
2567 //
2568 // caseInsensitiveStart given a single code point from a pattern string, determine the
2569 // set of characters that could potentially begin a case-insensitive
2570 // match of a string beginning with that character, using full Unicode
2571 // case insensitive matching.
2572 //
2573 // This is used in optimizing find().
2574 //
2575 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2576 // misses cases like this:
2577 // A string from the pattern begins with 'ss' (although all we know
2578 // in this context is that it begins with 's')
2579 // The pattern could match a string beginning with a German sharp-s
2580 //
2581 // To the ordinary case closure for a character c, we add all other
2582 // characters cx where the case closure of cx incudes a string form that begins
2583 // with the original character c.
2584 //
2585 // This function could be made smarter. The full pattern string is available
2586 // and it would be possible to verify that the extra characters being added
2587 // to the starting set fully match, rather than having just a first-char of the
2588 // folded form match.
2589 //
2590 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2591 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2592
2593 // Machine Generated below.
2594 // It may need updating with new versions of Unicode.
2595 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2596 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2597
2598 // Machine Generated Data. Do not hand edit.
2599 static const UChar32 RECaseFixCodePoints[] = {
2600 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2601 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2602 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2603 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2604 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2605
2606 static const int16_t RECaseFixStringOffsets[] = {
2607 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2608 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2609 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2610 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2611 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2612
2613 static const int16_t RECaseFixCounts[] = {
2614 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2615 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2616 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2617 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2618 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2619
2620 static const UChar RECaseFixData[] = {
2621 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2622 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2623 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2624 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2625 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2626 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2627 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2628 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2629 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2630 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2631 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2632
2633 // End of machine generated data.
2634
2635 if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2636 // This function should never be called with an invalid input character.
2637 UPRV_UNREACHABLE;
2638 } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2639 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2640 starterChars->set(caseFoldedC, caseFoldedC);
2641
2642 int32_t i;
2643 for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2644 // Simple linear search through the sorted list of interesting code points.
2645 }
2646
2647 if (RECaseFixCodePoints[i] == c) {
2648 int32_t dataIndex = RECaseFixStringOffsets[i];
2649 int32_t numCharsToAdd = RECaseFixCounts[i];
2650 UChar32 cpToAdd = 0;
2651 for (int32_t j=0; j<numCharsToAdd; j++) {
2652 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2653 starterChars->add(cpToAdd);
2654 }
2655 }
2656
2657 starterChars->closeOver(USET_CASE_INSENSITIVE);
2658 starterChars->removeAllStrings();
2659 } else {
2660 // Not a cased character. Just return it alone.
2661 starterChars->set(c, c);
2662 }
2663 }
2664
2665
2666 // Increment with overflow check.
2667 // val and delta will both be positive.
2668
safeIncrement(int32_t val,int32_t delta)2669 static int32_t safeIncrement(int32_t val, int32_t delta) {
2670 if (INT32_MAX - val > delta) {
2671 return val + delta;
2672 } else {
2673 return INT32_MAX;
2674 }
2675 }
2676
2677
2678 //------------------------------------------------------------------------------
2679 //
2680 // matchStartType Determine how a match can start.
2681 // Used to optimize find() operations.
2682 //
2683 // Operation is very similar to minMatchLength(). Walk the compiled
2684 // pattern, keeping an on-going minimum-match-length. For any
2685 // op where the min match coming in is zero, add that ops possible
2686 // starting matches to the possible starts for the overall pattern.
2687 //
2688 //------------------------------------------------------------------------------
matchStartType()2689 void RegexCompile::matchStartType() {
2690 if (U_FAILURE(*fStatus)) {
2691 return;
2692 }
2693
2694
2695 int32_t loc; // Location in the pattern of the current op being processed.
2696 int32_t op; // The op being processed
2697 int32_t opType; // The opcode type of the op
2698 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2699 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2700
2701 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2702 // could have advanced the position in a match.
2703 // (Maximum match length so far == 0)
2704
2705 // forwardedLength is a vector holding minimum-match-length values that
2706 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2707 // It must be one longer than the pattern being checked because some ops
2708 // will jmp to a end-of-block+1 location from within a block, and we must
2709 // count those when checking the block.
2710 int32_t end = fRXPat->fCompiledPat->size();
2711 UVector32 forwardedLength(end+1, *fStatus);
2712 forwardedLength.setSize(end+1);
2713 for (loc=3; loc<end; loc++) {
2714 forwardedLength.setElementAt(INT32_MAX, loc);
2715 }
2716
2717 for (loc = 3; loc<end; loc++) {
2718 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2719 opType = URX_TYPE(op);
2720
2721 // The loop is advancing linearly through the pattern.
2722 // If the op we are now at was the destination of a branch in the pattern,
2723 // and that path has a shorter minimum length than the current accumulated value,
2724 // replace the current accumulated value.
2725 if (forwardedLength.elementAti(loc) < currentLen) {
2726 currentLen = forwardedLength.elementAti(loc);
2727 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2728 }
2729
2730 switch (opType) {
2731 // Ops that don't change the total length matched
2732 case URX_RESERVED_OP:
2733 case URX_END:
2734 case URX_FAIL:
2735 case URX_STRING_LEN:
2736 case URX_NOP:
2737 case URX_START_CAPTURE:
2738 case URX_END_CAPTURE:
2739 case URX_BACKSLASH_B:
2740 case URX_BACKSLASH_BU:
2741 case URX_BACKSLASH_G:
2742 case URX_BACKSLASH_Z:
2743 case URX_DOLLAR:
2744 case URX_DOLLAR_M:
2745 case URX_DOLLAR_D:
2746 case URX_DOLLAR_MD:
2747 case URX_RELOC_OPRND:
2748 case URX_STO_INP_LOC:
2749 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2750 case URX_BACKREF_I:
2751
2752 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2753 case URX_LD_SP:
2754 break;
2755
2756 case URX_CARET:
2757 if (atStart) {
2758 fRXPat->fStartType = START_START;
2759 }
2760 break;
2761
2762 case URX_CARET_M:
2763 case URX_CARET_M_UNIX:
2764 if (atStart) {
2765 fRXPat->fStartType = START_LINE;
2766 }
2767 break;
2768
2769 case URX_ONECHAR:
2770 if (currentLen == 0) {
2771 // This character could appear at the start of a match.
2772 // Add it to the set of possible starting characters.
2773 fRXPat->fInitialChars->add(URX_VAL(op));
2774 numInitialStrings += 2;
2775 }
2776 currentLen = safeIncrement(currentLen, 1);
2777 atStart = FALSE;
2778 break;
2779
2780
2781 case URX_SETREF:
2782 if (currentLen == 0) {
2783 int32_t sn = URX_VAL(op);
2784 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2785 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2786 fRXPat->fInitialChars->addAll(*s);
2787 numInitialStrings += 2;
2788 }
2789 currentLen = safeIncrement(currentLen, 1);
2790 atStart = FALSE;
2791 break;
2792
2793 case URX_LOOP_SR_I:
2794 // [Set]*, like a SETREF, above, in what it can match,
2795 // but may not match at all, so currentLen is not incremented.
2796 if (currentLen == 0) {
2797 int32_t sn = URX_VAL(op);
2798 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2799 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2800 fRXPat->fInitialChars->addAll(*s);
2801 numInitialStrings += 2;
2802 }
2803 atStart = FALSE;
2804 break;
2805
2806 case URX_LOOP_DOT_I:
2807 if (currentLen == 0) {
2808 // .* at the start of a pattern.
2809 // Any character can begin the match.
2810 fRXPat->fInitialChars->clear();
2811 fRXPat->fInitialChars->complement();
2812 numInitialStrings += 2;
2813 }
2814 atStart = FALSE;
2815 break;
2816
2817
2818 case URX_STATIC_SETREF:
2819 if (currentLen == 0) {
2820 int32_t sn = URX_VAL(op);
2821 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2822 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2823 fRXPat->fInitialChars->addAll(s);
2824 numInitialStrings += 2;
2825 }
2826 currentLen = safeIncrement(currentLen, 1);
2827 atStart = FALSE;
2828 break;
2829
2830
2831
2832 case URX_STAT_SETREF_N:
2833 if (currentLen == 0) {
2834 int32_t sn = URX_VAL(op);
2835 UnicodeSet sc;
2836 sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2837 fRXPat->fInitialChars->addAll(sc);
2838 numInitialStrings += 2;
2839 }
2840 currentLen = safeIncrement(currentLen, 1);
2841 atStart = FALSE;
2842 break;
2843
2844
2845
2846 case URX_BACKSLASH_D:
2847 // Digit Char
2848 if (currentLen == 0) {
2849 UnicodeSet s;
2850 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2851 if (URX_VAL(op) != 0) {
2852 s.complement();
2853 }
2854 fRXPat->fInitialChars->addAll(s);
2855 numInitialStrings += 2;
2856 }
2857 currentLen = safeIncrement(currentLen, 1);
2858 atStart = FALSE;
2859 break;
2860
2861
2862 case URX_BACKSLASH_H:
2863 // Horiz white space
2864 if (currentLen == 0) {
2865 UnicodeSet s;
2866 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2867 s.add((UChar32)9); // Tab
2868 if (URX_VAL(op) != 0) {
2869 s.complement();
2870 }
2871 fRXPat->fInitialChars->addAll(s);
2872 numInitialStrings += 2;
2873 }
2874 currentLen = safeIncrement(currentLen, 1);
2875 atStart = FALSE;
2876 break;
2877
2878
2879 case URX_BACKSLASH_R: // Any line ending sequence
2880 case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2881 if (currentLen == 0) {
2882 UnicodeSet s;
2883 s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2884 s.add((UChar32)0x85);
2885 s.add((UChar32)0x2028, (UChar32)0x2029);
2886 if (URX_VAL(op) != 0) {
2887 // Complement option applies to URX_BACKSLASH_V only.
2888 s.complement();
2889 }
2890 fRXPat->fInitialChars->addAll(s);
2891 numInitialStrings += 2;
2892 }
2893 currentLen = safeIncrement(currentLen, 1);
2894 atStart = FALSE;
2895 break;
2896
2897
2898
2899 case URX_ONECHAR_I:
2900 // Case Insensitive Single Character.
2901 if (currentLen == 0) {
2902 UChar32 c = URX_VAL(op);
2903 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2904 UnicodeSet starters(c, c);
2905 starters.closeOver(USET_CASE_INSENSITIVE);
2906 // findCaseInsensitiveStarters(c, &starters);
2907 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2908 // The expanded folding can't match the pattern.
2909 fRXPat->fInitialChars->addAll(starters);
2910 } else {
2911 // Char has no case variants. Just add it as-is to the
2912 // set of possible starting chars.
2913 fRXPat->fInitialChars->add(c);
2914 }
2915 numInitialStrings += 2;
2916 }
2917 currentLen = safeIncrement(currentLen, 1);
2918 atStart = FALSE;
2919 break;
2920
2921
2922 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2923 case URX_DOTANY_ALL: // . matches one or two.
2924 case URX_DOTANY:
2925 case URX_DOTANY_UNIX:
2926 if (currentLen == 0) {
2927 // These constructs are all bad news when they appear at the start
2928 // of a match. Any character can begin the match.
2929 fRXPat->fInitialChars->clear();
2930 fRXPat->fInitialChars->complement();
2931 numInitialStrings += 2;
2932 }
2933 currentLen = safeIncrement(currentLen, 1);
2934 atStart = FALSE;
2935 break;
2936
2937
2938 case URX_JMPX:
2939 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2940 U_FALLTHROUGH;
2941 case URX_JMP:
2942 {
2943 int32_t jmpDest = URX_VAL(op);
2944 if (jmpDest < loc) {
2945 // Loop of some kind. Can safely ignore, the worst that will happen
2946 // is that we understate the true minimum length
2947 currentLen = forwardedLength.elementAti(loc+1);
2948
2949 } else {
2950 // Forward jump. Propagate the current min length to the target loc of the jump.
2951 U_ASSERT(jmpDest <= end+1);
2952 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2953 forwardedLength.setElementAt(currentLen, jmpDest);
2954 }
2955 }
2956 }
2957 atStart = FALSE;
2958 break;
2959
2960 case URX_JMP_SAV:
2961 case URX_JMP_SAV_X:
2962 // Combo of state save to the next loc, + jmp backwards.
2963 // Net effect on min. length computation is nothing.
2964 atStart = FALSE;
2965 break;
2966
2967 case URX_BACKTRACK:
2968 // Fails are kind of like a branch, except that the min length was
2969 // propagated already, by the state save.
2970 currentLen = forwardedLength.elementAti(loc+1);
2971 atStart = FALSE;
2972 break;
2973
2974
2975 case URX_STATE_SAVE:
2976 {
2977 // State Save, for forward jumps, propagate the current minimum.
2978 // of the state save.
2979 int32_t jmpDest = URX_VAL(op);
2980 if (jmpDest > loc) {
2981 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2982 forwardedLength.setElementAt(currentLen, jmpDest);
2983 }
2984 }
2985 }
2986 atStart = FALSE;
2987 break;
2988
2989
2990
2991
2992 case URX_STRING:
2993 {
2994 loc++;
2995 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2996 int32_t stringLen = URX_VAL(stringLenOp);
2997 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2998 U_ASSERT(stringLenOp >= 2);
2999 if (currentLen == 0) {
3000 // Add the starting character of this string to the set of possible starting
3001 // characters for this pattern.
3002 int32_t stringStartIdx = URX_VAL(op);
3003 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3004 fRXPat->fInitialChars->add(c);
3005
3006 // Remember this string. After the entire pattern has been checked,
3007 // if nothing else is identified that can start a match, we'll use it.
3008 numInitialStrings++;
3009 fRXPat->fInitialStringIdx = stringStartIdx;
3010 fRXPat->fInitialStringLen = stringLen;
3011 }
3012
3013 currentLen = safeIncrement(currentLen, stringLen);
3014 atStart = FALSE;
3015 }
3016 break;
3017
3018 case URX_STRING_I:
3019 {
3020 // Case-insensitive string. Unlike exact-match strings, we won't
3021 // attempt a string search for possible match positions. But we
3022 // do update the set of possible starting characters.
3023 loc++;
3024 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3025 int32_t stringLen = URX_VAL(stringLenOp);
3026 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3027 U_ASSERT(stringLenOp >= 2);
3028 if (currentLen == 0) {
3029 // Add the starting character of this string to the set of possible starting
3030 // characters for this pattern.
3031 int32_t stringStartIdx = URX_VAL(op);
3032 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3033 UnicodeSet s;
3034 findCaseInsensitiveStarters(c, &s);
3035 fRXPat->fInitialChars->addAll(s);
3036 numInitialStrings += 2; // Matching on an initial string not possible.
3037 }
3038 currentLen = safeIncrement(currentLen, stringLen);
3039 atStart = FALSE;
3040 }
3041 break;
3042
3043 case URX_CTR_INIT:
3044 case URX_CTR_INIT_NG:
3045 {
3046 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3047 // so location must be updated accordingly.
3048 // Loop Init Ops.
3049 // If the min loop count == 0
3050 // move loc forwards to the end of the loop, skipping over the body.
3051 // If the min count is > 0,
3052 // continue normal processing of the body of the loop.
3053 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3054 loopEndLoc = URX_VAL(loopEndLoc);
3055 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3056 if (minLoopCount == 0) {
3057 // Min Loop Count of 0, treat like a forward branch and
3058 // move the current minimum length up to the target
3059 // (end of loop) location.
3060 U_ASSERT(loopEndLoc <= end+1);
3061 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3062 forwardedLength.setElementAt(currentLen, loopEndLoc);
3063 }
3064 }
3065 loc+=3; // Skips over operands of CTR_INIT
3066 }
3067 atStart = FALSE;
3068 break;
3069
3070
3071 case URX_CTR_LOOP:
3072 case URX_CTR_LOOP_NG:
3073 // Loop ops.
3074 // The jump is conditional, backwards only.
3075 atStart = FALSE;
3076 break;
3077
3078 case URX_LOOP_C:
3079 // More loop ops. These state-save to themselves.
3080 // don't change the minimum match
3081 atStart = FALSE;
3082 break;
3083
3084
3085 case URX_LA_START:
3086 case URX_LB_START:
3087 {
3088 // Look-around. Scan forward until the matching look-ahead end,
3089 // without processing the look-around block. This is overly pessimistic.
3090
3091 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3092 // lookahead contains two LA_END instructions, so count goes up by two
3093 // for each LA_START.
3094 int32_t depth = (opType == URX_LA_START? 2: 1);
3095 for (;;) {
3096 loc++;
3097 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3098 if (URX_TYPE(op) == URX_LA_START) {
3099 depth+=2;
3100 }
3101 if (URX_TYPE(op) == URX_LB_START) {
3102 depth++;
3103 }
3104 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3105 depth--;
3106 if (depth == 0) {
3107 break;
3108 }
3109 }
3110 if (URX_TYPE(op) == URX_STATE_SAVE) {
3111 // Need this because neg lookahead blocks will FAIL to outside
3112 // of the block.
3113 int32_t jmpDest = URX_VAL(op);
3114 if (jmpDest > loc) {
3115 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3116 forwardedLength.setElementAt(currentLen, jmpDest);
3117 }
3118 }
3119 }
3120 U_ASSERT(loc <= end);
3121 }
3122 }
3123 break;
3124
3125 case URX_LA_END:
3126 case URX_LB_CONT:
3127 case URX_LB_END:
3128 case URX_LBN_CONT:
3129 case URX_LBN_END:
3130 UPRV_UNREACHABLE; // Shouldn't get here. These ops should be
3131 // consumed by the scan in URX_LA_START and LB_START
3132 default:
3133 UPRV_UNREACHABLE;
3134 }
3135
3136 }
3137
3138
3139 // We have finished walking through the ops. Check whether some forward jump
3140 // propagated a shorter length to location end+1.
3141 if (forwardedLength.elementAti(end+1) < currentLen) {
3142 currentLen = forwardedLength.elementAti(end+1);
3143 }
3144
3145
3146 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3147
3148
3149 // Sort out what we should check for when looking for candidate match start positions.
3150 // In order of preference,
3151 // 1. Start of input text buffer.
3152 // 2. A literal string.
3153 // 3. Start of line in multi-line mode.
3154 // 4. A single literal character.
3155 // 5. A character from a set of characters.
3156 //
3157 if (fRXPat->fStartType == START_START) {
3158 // Match only at the start of an input text string.
3159 // start type is already set. We're done.
3160 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3161 // Match beginning only with a literal string.
3162 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3163 U_ASSERT(fRXPat->fInitialChars->contains(c));
3164 fRXPat->fStartType = START_STRING;
3165 fRXPat->fInitialChar = c;
3166 } else if (fRXPat->fStartType == START_LINE) {
3167 // Match at start of line in Multi-Line mode.
3168 // Nothing to do here; everything is already set.
3169 } else if (fRXPat->fMinMatchLen == 0) {
3170 // Zero length match possible. We could start anywhere.
3171 fRXPat->fStartType = START_NO_INFO;
3172 } else if (fRXPat->fInitialChars->size() == 1) {
3173 // All matches begin with the same char.
3174 fRXPat->fStartType = START_CHAR;
3175 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3176 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3177 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3178 fRXPat->fMinMatchLen > 0) {
3179 // Matches start with a set of character smaller than the set of all chars.
3180 fRXPat->fStartType = START_SET;
3181 } else {
3182 // Matches can start with anything
3183 fRXPat->fStartType = START_NO_INFO;
3184 }
3185
3186 return;
3187 }
3188
3189
3190
3191 //------------------------------------------------------------------------------
3192 //
3193 // minMatchLength Calculate the length of the shortest string that could
3194 // match the specified pattern.
3195 // Length is in 16 bit code units, not code points.
3196 //
3197 // The calculated length may not be exact. The returned
3198 // value may be shorter than the actual minimum; it must
3199 // never be longer.
3200 //
3201 // start and end are the range of p-code operations to be
3202 // examined. The endpoints are included in the range.
3203 //
3204 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3205 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3206 if (U_FAILURE(*fStatus)) {
3207 return 0;
3208 }
3209
3210 U_ASSERT(start <= end);
3211 U_ASSERT(end < fRXPat->fCompiledPat->size());
3212
3213
3214 int32_t loc;
3215 int32_t op;
3216 int32_t opType;
3217 int32_t currentLen = 0;
3218
3219
3220 // forwardedLength is a vector holding minimum-match-length values that
3221 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3222 // It must be one longer than the pattern being checked because some ops
3223 // will jmp to a end-of-block+1 location from within a block, and we must
3224 // count those when checking the block.
3225 UVector32 forwardedLength(end+2, *fStatus);
3226 forwardedLength.setSize(end+2);
3227 for (loc=start; loc<=end+1; loc++) {
3228 forwardedLength.setElementAt(INT32_MAX, loc);
3229 }
3230
3231 for (loc = start; loc<=end; loc++) {
3232 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3233 opType = URX_TYPE(op);
3234
3235 // The loop is advancing linearly through the pattern.
3236 // If the op we are now at was the destination of a branch in the pattern,
3237 // and that path has a shorter minimum length than the current accumulated value,
3238 // replace the current accumulated value.
3239 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3240 // no-match-possible cases.
3241 if (forwardedLength.elementAti(loc) < currentLen) {
3242 currentLen = forwardedLength.elementAti(loc);
3243 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3244 }
3245
3246 switch (opType) {
3247 // Ops that don't change the total length matched
3248 case URX_RESERVED_OP:
3249 case URX_END:
3250 case URX_STRING_LEN:
3251 case URX_NOP:
3252 case URX_START_CAPTURE:
3253 case URX_END_CAPTURE:
3254 case URX_BACKSLASH_B:
3255 case URX_BACKSLASH_BU:
3256 case URX_BACKSLASH_G:
3257 case URX_BACKSLASH_Z:
3258 case URX_CARET:
3259 case URX_DOLLAR:
3260 case URX_DOLLAR_M:
3261 case URX_DOLLAR_D:
3262 case URX_DOLLAR_MD:
3263 case URX_RELOC_OPRND:
3264 case URX_STO_INP_LOC:
3265 case URX_CARET_M:
3266 case URX_CARET_M_UNIX:
3267 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3268 case URX_BACKREF_I:
3269
3270 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3271 case URX_LD_SP:
3272
3273 case URX_JMP_SAV:
3274 case URX_JMP_SAV_X:
3275 break;
3276
3277
3278 // Ops that match a minimum of one character (one or two 16 bit code units.)
3279 //
3280 case URX_ONECHAR:
3281 case URX_STATIC_SETREF:
3282 case URX_STAT_SETREF_N:
3283 case URX_SETREF:
3284 case URX_BACKSLASH_D:
3285 case URX_BACKSLASH_H:
3286 case URX_BACKSLASH_R:
3287 case URX_BACKSLASH_V:
3288 case URX_ONECHAR_I:
3289 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3290 case URX_DOTANY_ALL: // . matches one or two.
3291 case URX_DOTANY:
3292 case URX_DOTANY_UNIX:
3293 currentLen = safeIncrement(currentLen, 1);
3294 break;
3295
3296
3297 case URX_JMPX:
3298 loc++; // URX_JMPX has an extra operand, ignored here,
3299 // otherwise processed identically to URX_JMP.
3300 U_FALLTHROUGH;
3301 case URX_JMP:
3302 {
3303 int32_t jmpDest = URX_VAL(op);
3304 if (jmpDest < loc) {
3305 // Loop of some kind. Can safely ignore, the worst that will happen
3306 // is that we understate the true minimum length
3307 currentLen = forwardedLength.elementAti(loc+1);
3308 } else {
3309 // Forward jump. Propagate the current min length to the target loc of the jump.
3310 U_ASSERT(jmpDest <= end+1);
3311 if (forwardedLength.elementAti(jmpDest) > currentLen) {
3312 forwardedLength.setElementAt(currentLen, jmpDest);
3313 }
3314 }
3315 }
3316 break;
3317
3318 case URX_BACKTRACK:
3319 {
3320 // Back-tracks are kind of like a branch, except that the min length was
3321 // propagated already, by the state save.
3322 currentLen = forwardedLength.elementAti(loc+1);
3323 }
3324 break;
3325
3326
3327 case URX_STATE_SAVE:
3328 {
3329 // State Save, for forward jumps, propagate the current minimum.
3330 // of the state save.
3331 int32_t jmpDest = URX_VAL(op);
3332 if (jmpDest > loc) {
3333 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3334 forwardedLength.setElementAt(currentLen, jmpDest);
3335 }
3336 }
3337 }
3338 break;
3339
3340
3341 case URX_STRING:
3342 {
3343 loc++;
3344 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3345 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3346 }
3347 break;
3348
3349
3350 case URX_STRING_I:
3351 {
3352 loc++;
3353 // TODO: with full case folding, matching input text may be shorter than
3354 // the string we have here. More smarts could put some bounds on it.
3355 // Assume a min length of one for now. A min length of zero causes
3356 // optimization failures for a pattern like "string"+
3357 // currentLen += URX_VAL(stringLenOp);
3358 currentLen = safeIncrement(currentLen, 1);
3359 }
3360 break;
3361
3362 case URX_CTR_INIT:
3363 case URX_CTR_INIT_NG:
3364 {
3365 // Loop Init Ops.
3366 // If the min loop count == 0
3367 // move loc forwards to the end of the loop, skipping over the body.
3368 // If the min count is > 0,
3369 // continue normal processing of the body of the loop.
3370 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3371 loopEndLoc = URX_VAL(loopEndLoc);
3372 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3373 if (minLoopCount == 0) {
3374 loc = loopEndLoc;
3375 } else {
3376 loc+=3; // Skips over operands of CTR_INIT
3377 }
3378 }
3379 break;
3380
3381
3382 case URX_CTR_LOOP:
3383 case URX_CTR_LOOP_NG:
3384 // Loop ops.
3385 // The jump is conditional, backwards only.
3386 break;
3387
3388 case URX_LOOP_SR_I:
3389 case URX_LOOP_DOT_I:
3390 case URX_LOOP_C:
3391 // More loop ops. These state-save to themselves.
3392 // don't change the minimum match - could match nothing at all.
3393 break;
3394
3395
3396 case URX_LA_START:
3397 case URX_LB_START:
3398 {
3399 // Look-around. Scan forward until the matching look-ahead end,
3400 // without processing the look-around block. This is overly pessimistic for look-ahead,
3401 // it assumes that the look-ahead match might be zero-length.
3402 // TODO: Positive lookahead could recursively do the block, then continue
3403 // with the longer of the block or the value coming in. Ticket 6060
3404 int32_t depth = (opType == URX_LA_START? 2: 1);
3405 for (;;) {
3406 loc++;
3407 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3408 if (URX_TYPE(op) == URX_LA_START) {
3409 // The boilerplate for look-ahead includes two LA_END insturctions,
3410 // Depth will be decremented by each one when it is seen.
3411 depth += 2;
3412 }
3413 if (URX_TYPE(op) == URX_LB_START) {
3414 depth++;
3415 }
3416 if (URX_TYPE(op) == URX_LA_END) {
3417 depth--;
3418 if (depth == 0) {
3419 break;
3420 }
3421 }
3422 if (URX_TYPE(op)==URX_LBN_END) {
3423 depth--;
3424 if (depth == 0) {
3425 break;
3426 }
3427 }
3428 if (URX_TYPE(op) == URX_STATE_SAVE) {
3429 // Need this because neg lookahead blocks will FAIL to outside
3430 // of the block.
3431 int32_t jmpDest = URX_VAL(op);
3432 if (jmpDest > loc) {
3433 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3434 forwardedLength.setElementAt(currentLen, jmpDest);
3435 }
3436 }
3437 }
3438 U_ASSERT(loc <= end);
3439 }
3440 }
3441 break;
3442
3443 case URX_LA_END:
3444 case URX_LB_CONT:
3445 case URX_LB_END:
3446 case URX_LBN_CONT:
3447 case URX_LBN_END:
3448 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3449 // range being sized, which happens when measuring size of look-behind blocks.
3450 break;
3451
3452 default:
3453 UPRV_UNREACHABLE;
3454 }
3455
3456 }
3457
3458 // We have finished walking through the ops. Check whether some forward jump
3459 // propagated a shorter length to location end+1.
3460 if (forwardedLength.elementAti(end+1) < currentLen) {
3461 currentLen = forwardedLength.elementAti(end+1);
3462 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3463 }
3464
3465 return currentLen;
3466 }
3467
3468 //------------------------------------------------------------------------------
3469 //
3470 // maxMatchLength Calculate the length of the longest string that could
3471 // match the specified pattern.
3472 // Length is in 16 bit code units, not code points.
3473 //
3474 // The calculated length may not be exact. The returned
3475 // value may be longer than the actual maximum; it must
3476 // never be shorter.
3477 //
3478 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3479 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3480 if (U_FAILURE(*fStatus)) {
3481 return 0;
3482 }
3483 U_ASSERT(start <= end);
3484 U_ASSERT(end < fRXPat->fCompiledPat->size());
3485
3486 int32_t loc;
3487 int32_t op;
3488 int32_t opType;
3489 int32_t currentLen = 0;
3490 UVector32 forwardedLength(end+1, *fStatus);
3491 forwardedLength.setSize(end+1);
3492
3493 for (loc=start; loc<=end; loc++) {
3494 forwardedLength.setElementAt(0, loc);
3495 }
3496
3497 for (loc = start; loc<=end; loc++) {
3498 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3499 opType = URX_TYPE(op);
3500
3501 // The loop is advancing linearly through the pattern.
3502 // If the op we are now at was the destination of a branch in the pattern,
3503 // and that path has a longer maximum length than the current accumulated value,
3504 // replace the current accumulated value.
3505 if (forwardedLength.elementAti(loc) > currentLen) {
3506 currentLen = forwardedLength.elementAti(loc);
3507 }
3508
3509 switch (opType) {
3510 // Ops that don't change the total length matched
3511 case URX_RESERVED_OP:
3512 case URX_END:
3513 case URX_STRING_LEN:
3514 case URX_NOP:
3515 case URX_START_CAPTURE:
3516 case URX_END_CAPTURE:
3517 case URX_BACKSLASH_B:
3518 case URX_BACKSLASH_BU:
3519 case URX_BACKSLASH_G:
3520 case URX_BACKSLASH_Z:
3521 case URX_CARET:
3522 case URX_DOLLAR:
3523 case URX_DOLLAR_M:
3524 case URX_DOLLAR_D:
3525 case URX_DOLLAR_MD:
3526 case URX_RELOC_OPRND:
3527 case URX_STO_INP_LOC:
3528 case URX_CARET_M:
3529 case URX_CARET_M_UNIX:
3530
3531 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3532 case URX_LD_SP:
3533
3534 case URX_LB_END:
3535 case URX_LB_CONT:
3536 case URX_LBN_CONT:
3537 case URX_LBN_END:
3538 break;
3539
3540
3541 // Ops that increase that cause an unbounded increase in the length
3542 // of a matched string, or that increase it a hard to characterize way.
3543 // Call the max length unbounded, and stop further checking.
3544 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3545 case URX_BACKREF_I:
3546 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3547 currentLen = INT32_MAX;
3548 break;
3549
3550
3551 // Ops that match a max of one character (possibly two 16 bit code units.)
3552 //
3553 case URX_STATIC_SETREF:
3554 case URX_STAT_SETREF_N:
3555 case URX_SETREF:
3556 case URX_BACKSLASH_D:
3557 case URX_BACKSLASH_H:
3558 case URX_BACKSLASH_R:
3559 case URX_BACKSLASH_V:
3560 case URX_ONECHAR_I:
3561 case URX_DOTANY_ALL:
3562 case URX_DOTANY:
3563 case URX_DOTANY_UNIX:
3564 currentLen = safeIncrement(currentLen, 2);
3565 break;
3566
3567 // Single literal character. Increase current max length by one or two,
3568 // depending on whether the char is in the supplementary range.
3569 case URX_ONECHAR:
3570 currentLen = safeIncrement(currentLen, 1);
3571 if (URX_VAL(op) > 0x10000) {
3572 currentLen = safeIncrement(currentLen, 1);
3573 }
3574 break;
3575
3576 // Jumps.
3577 //
3578 case URX_JMP:
3579 case URX_JMPX:
3580 case URX_JMP_SAV:
3581 case URX_JMP_SAV_X:
3582 {
3583 int32_t jmpDest = URX_VAL(op);
3584 if (jmpDest < loc) {
3585 // Loop of some kind. Max match length is unbounded.
3586 currentLen = INT32_MAX;
3587 } else {
3588 // Forward jump. Propagate the current min length to the target loc of the jump.
3589 if (forwardedLength.elementAti(jmpDest) < currentLen) {
3590 forwardedLength.setElementAt(currentLen, jmpDest);
3591 }
3592 currentLen = 0;
3593 }
3594 }
3595 break;
3596
3597 case URX_BACKTRACK:
3598 // back-tracks are kind of like a branch, except that the max length was
3599 // propagated already, by the state save.
3600 currentLen = forwardedLength.elementAti(loc+1);
3601 break;
3602
3603
3604 case URX_STATE_SAVE:
3605 {
3606 // State Save, for forward jumps, propagate the current minimum.
3607 // of the state save.
3608 // For backwards jumps, they create a loop, maximum
3609 // match length is unbounded.
3610 int32_t jmpDest = URX_VAL(op);
3611 if (jmpDest > loc) {
3612 if (currentLen > forwardedLength.elementAti(jmpDest)) {
3613 forwardedLength.setElementAt(currentLen, jmpDest);
3614 }
3615 } else {
3616 currentLen = INT32_MAX;
3617 }
3618 }
3619 break;
3620
3621
3622
3623
3624 case URX_STRING:
3625 {
3626 loc++;
3627 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3628 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3629 break;
3630 }
3631
3632 case URX_STRING_I:
3633 // TODO: This code assumes that any user string that matches will be no longer
3634 // than our compiled string, with case insensitive matching.
3635 // Our compiled string has been case-folded already.
3636 //
3637 // Any matching user string will have no more code points than our
3638 // compiled (folded) string. Folding may add code points, but
3639 // not remove them.
3640 //
3641 // There is a potential problem if a supplemental code point
3642 // case-folds to a BMP code point. In this case our compiled string
3643 // could be shorter (in code units) than a matching user string.
3644 //
3645 // At this time (Unicode 6.1) there are no such characters, and this case
3646 // is not being handled. A test, intltest regex/Bug9283, will fail if
3647 // any problematic characters are added to Unicode.
3648 //
3649 // If this happens, we can make a set of the BMP chars that the
3650 // troublesome supplementals fold to, scan our string, and bump the
3651 // currentLen one extra for each that is found.
3652 //
3653 {
3654 loc++;
3655 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3656 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3657 }
3658 break;
3659
3660 case URX_CTR_INIT:
3661 case URX_CTR_INIT_NG:
3662 // For Loops, recursively call this function on the pattern for the loop body,
3663 // then multiply the result by the maximum loop count.
3664 {
3665 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3666 if (loopEndLoc == loc+4) {
3667 // Loop has an empty body. No affect on max match length.
3668 // Continue processing with code after the loop end.
3669 loc = loopEndLoc;
3670 break;
3671 }
3672
3673 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3674 if (maxLoopCount == -1) {
3675 // Unbounded Loop. No upper bound on match length.
3676 currentLen = INT32_MAX;
3677 break;
3678 }
3679
3680 U_ASSERT(loopEndLoc >= loc+4);
3681 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
3682 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3683 if (updatedLen >= INT32_MAX) {
3684 currentLen = INT32_MAX;
3685 break;
3686 }
3687 currentLen = (int32_t)updatedLen;
3688 loc = loopEndLoc;
3689 break;
3690 }
3691
3692 case URX_CTR_LOOP:
3693 case URX_CTR_LOOP_NG:
3694 // These opcodes will be skipped over by code for URX_CTR_INIT.
3695 // We shouldn't encounter them here.
3696 UPRV_UNREACHABLE;
3697
3698 case URX_LOOP_SR_I:
3699 case URX_LOOP_DOT_I:
3700 case URX_LOOP_C:
3701 // For anything to do with loops, make the match length unbounded.
3702 currentLen = INT32_MAX;
3703 break;
3704
3705
3706
3707 case URX_LA_START:
3708 case URX_LA_END:
3709 // Look-ahead. Just ignore, treat the look-ahead block as if
3710 // it were normal pattern. Gives a too-long match length,
3711 // but good enough for now.
3712 break;
3713
3714 // End of look-ahead ops should always be consumed by the processing at
3715 // the URX_LA_START op.
3716 // UPRV_UNREACHABLE;
3717
3718 case URX_LB_START:
3719 {
3720 // Look-behind. Scan forward until the matching look-around end,
3721 // without processing the look-behind block.
3722 int32_t dataLoc = URX_VAL(op);
3723 for (loc = loc + 1; loc < end; ++loc) {
3724 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3725 int32_t opType = URX_TYPE(op);
3726 if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3727 break;
3728 }
3729 }
3730 U_ASSERT(loc < end);
3731 }
3732 break;
3733
3734 default:
3735 UPRV_UNREACHABLE;
3736 }
3737
3738
3739 if (currentLen == INT32_MAX) {
3740 // The maximum length is unbounded.
3741 // Stop further processing of the pattern.
3742 break;
3743 }
3744
3745 }
3746 return currentLen;
3747
3748 }
3749
3750
3751 //------------------------------------------------------------------------------
3752 //
3753 // stripNOPs Remove any NOP operations from the compiled pattern code.
3754 // Extra NOPs are inserted for some constructs during the initial
3755 // code generation to provide locations that may be patched later.
3756 // Many end up unneeded, and are removed by this function.
3757 //
3758 // In order to minimize the number of passes through the pattern,
3759 // back-reference fixup is also performed here (adjusting
3760 // back-reference operands to point to the correct frame offsets).
3761 //
3762 //------------------------------------------------------------------------------
stripNOPs()3763 void RegexCompile::stripNOPs() {
3764
3765 if (U_FAILURE(*fStatus)) {
3766 return;
3767 }
3768
3769 int32_t end = fRXPat->fCompiledPat->size();
3770 UVector32 deltas(end, *fStatus);
3771
3772 // Make a first pass over the code, computing the amount that things
3773 // will be offset at each location in the original code.
3774 int32_t loc;
3775 int32_t d = 0;
3776 for (loc=0; loc<end; loc++) {
3777 deltas.addElement(d, *fStatus);
3778 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3779 if (URX_TYPE(op) == URX_NOP) {
3780 d++;
3781 }
3782 }
3783
3784 UnicodeString caseStringBuffer;
3785
3786 // Make a second pass over the code, removing the NOPs by moving following
3787 // code up, and patching operands that refer to code locations that
3788 // are being moved. The array of offsets from the first step is used
3789 // to compute the new operand values.
3790 int32_t src;
3791 int32_t dst = 0;
3792 for (src=0; src<end; src++) {
3793 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3794 int32_t opType = URX_TYPE(op);
3795 switch (opType) {
3796 case URX_NOP:
3797 break;
3798
3799 case URX_STATE_SAVE:
3800 case URX_JMP:
3801 case URX_CTR_LOOP:
3802 case URX_CTR_LOOP_NG:
3803 case URX_RELOC_OPRND:
3804 case URX_JMPX:
3805 case URX_JMP_SAV:
3806 case URX_JMP_SAV_X:
3807 // These are instructions with operands that refer to code locations.
3808 {
3809 int32_t operandAddress = URX_VAL(op);
3810 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3811 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3812 op = buildOp(opType, fixedOperandAddress);
3813 fRXPat->fCompiledPat->setElementAt(op, dst);
3814 dst++;
3815 break;
3816 }
3817
3818 case URX_BACKREF:
3819 case URX_BACKREF_I:
3820 {
3821 int32_t where = URX_VAL(op);
3822 if (where > fRXPat->fGroupMap->size()) {
3823 error(U_REGEX_INVALID_BACK_REF);
3824 break;
3825 }
3826 where = fRXPat->fGroupMap->elementAti(where-1);
3827 op = buildOp(opType, where);
3828 fRXPat->fCompiledPat->setElementAt(op, dst);
3829 dst++;
3830
3831 fRXPat->fNeedsAltInput = TRUE;
3832 break;
3833 }
3834 case URX_RESERVED_OP:
3835 case URX_RESERVED_OP_N:
3836 case URX_BACKTRACK:
3837 case URX_END:
3838 case URX_ONECHAR:
3839 case URX_STRING:
3840 case URX_STRING_LEN:
3841 case URX_START_CAPTURE:
3842 case URX_END_CAPTURE:
3843 case URX_STATIC_SETREF:
3844 case URX_STAT_SETREF_N:
3845 case URX_SETREF:
3846 case URX_DOTANY:
3847 case URX_FAIL:
3848 case URX_BACKSLASH_B:
3849 case URX_BACKSLASH_BU:
3850 case URX_BACKSLASH_G:
3851 case URX_BACKSLASH_X:
3852 case URX_BACKSLASH_Z:
3853 case URX_DOTANY_ALL:
3854 case URX_BACKSLASH_D:
3855 case URX_CARET:
3856 case URX_DOLLAR:
3857 case URX_CTR_INIT:
3858 case URX_CTR_INIT_NG:
3859 case URX_DOTANY_UNIX:
3860 case URX_STO_SP:
3861 case URX_LD_SP:
3862 case URX_STO_INP_LOC:
3863 case URX_LA_START:
3864 case URX_LA_END:
3865 case URX_ONECHAR_I:
3866 case URX_STRING_I:
3867 case URX_DOLLAR_M:
3868 case URX_CARET_M:
3869 case URX_CARET_M_UNIX:
3870 case URX_LB_START:
3871 case URX_LB_CONT:
3872 case URX_LB_END:
3873 case URX_LBN_CONT:
3874 case URX_LBN_END:
3875 case URX_LOOP_SR_I:
3876 case URX_LOOP_DOT_I:
3877 case URX_LOOP_C:
3878 case URX_DOLLAR_D:
3879 case URX_DOLLAR_MD:
3880 case URX_BACKSLASH_H:
3881 case URX_BACKSLASH_R:
3882 case URX_BACKSLASH_V:
3883 // These instructions are unaltered by the relocation.
3884 fRXPat->fCompiledPat->setElementAt(op, dst);
3885 dst++;
3886 break;
3887
3888 default:
3889 // Some op is unaccounted for.
3890 UPRV_UNREACHABLE;
3891 }
3892 }
3893
3894 fRXPat->fCompiledPat->setSize(dst);
3895 }
3896
3897
3898
3899
3900 //------------------------------------------------------------------------------
3901 //
3902 // Error Report a rule parse error.
3903 // Only report it if no previous error has been recorded.
3904 //
3905 //------------------------------------------------------------------------------
error(UErrorCode e)3906 void RegexCompile::error(UErrorCode e) {
3907 if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3908 *fStatus = e;
3909 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3910 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3911 // int64_t. If the values of the latter are out of range for the former,
3912 // set them to the appropriate "field not supported" values.
3913 if (fLineNum > 0x7FFFFFFF) {
3914 fParseErr->line = 0;
3915 fParseErr->offset = -1;
3916 } else if (fCharNum > 0x7FFFFFFF) {
3917 fParseErr->line = (int32_t)fLineNum;
3918 fParseErr->offset = -1;
3919 } else {
3920 fParseErr->line = (int32_t)fLineNum;
3921 fParseErr->offset = (int32_t)fCharNum;
3922 }
3923
3924 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3925
3926 // Fill in the context.
3927 // Note: extractBetween() pins supplied indicies to the string bounds.
3928 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3929 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3930 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3931 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3932 }
3933 }
3934
3935
3936 //
3937 // Assorted Unicode character constants.
3938 // Numeric because there is no portable way to enter them as literals.
3939 // (Think EBCDIC).
3940 //
3941 static const UChar chCR = 0x0d; // New lines, for terminating comments.
3942 static const UChar chLF = 0x0a; // Line Feed
3943 static const UChar chPound = 0x23; // '#', introduces a comment.
3944 static const UChar chDigit0 = 0x30; // '0'
3945 static const UChar chDigit7 = 0x37; // '9'
3946 static const UChar chColon = 0x3A; // ':'
3947 static const UChar chE = 0x45; // 'E'
3948 static const UChar chQ = 0x51; // 'Q'
3949 //static const UChar chN = 0x4E; // 'N'
3950 static const UChar chP = 0x50; // 'P'
3951 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3952 //static const UChar chLBracket = 0x5b; // '['
3953 static const UChar chRBracket = 0x5d; // ']'
3954 static const UChar chUp = 0x5e; // '^'
3955 static const UChar chLowerP = 0x70;
3956 static const UChar chLBrace = 0x7b; // '{'
3957 static const UChar chRBrace = 0x7d; // '}'
3958 static const UChar chNEL = 0x85; // NEL newline variant
3959 static const UChar chLS = 0x2028; // Unicode Line Separator
3960
3961
3962 //------------------------------------------------------------------------------
3963 //
3964 // nextCharLL Low Level Next Char from the regex pattern.
3965 // Get a char from the string, keep track of input position
3966 // for error reporting.
3967 //
3968 //------------------------------------------------------------------------------
nextCharLL()3969 UChar32 RegexCompile::nextCharLL() {
3970 UChar32 ch;
3971
3972 if (fPeekChar != -1) {
3973 ch = fPeekChar;
3974 fPeekChar = -1;
3975 return ch;
3976 }
3977
3978 // assume we're already in the right place
3979 ch = UTEXT_NEXT32(fRXPat->fPattern);
3980 if (ch == U_SENTINEL) {
3981 return ch;
3982 }
3983
3984 if (ch == chCR ||
3985 ch == chNEL ||
3986 ch == chLS ||
3987 (ch == chLF && fLastChar != chCR)) {
3988 // Character is starting a new line. Bump up the line number, and
3989 // reset the column to 0.
3990 fLineNum++;
3991 fCharNum=0;
3992 }
3993 else {
3994 // Character is not starting a new line. Except in the case of a
3995 // LF following a CR, increment the column position.
3996 if (ch != chLF) {
3997 fCharNum++;
3998 }
3999 }
4000 fLastChar = ch;
4001 return ch;
4002 }
4003
4004 //------------------------------------------------------------------------------
4005 //
4006 // peekCharLL Low Level Character Scanning, sneak a peek at the next
4007 // character without actually getting it.
4008 //
4009 //------------------------------------------------------------------------------
peekCharLL()4010 UChar32 RegexCompile::peekCharLL() {
4011 if (fPeekChar == -1) {
4012 fPeekChar = nextCharLL();
4013 }
4014 return fPeekChar;
4015 }
4016
4017
4018 //------------------------------------------------------------------------------
4019 //
4020 // nextChar for pattern scanning. At this level, we handle stripping
4021 // out comments and processing some backslash character escapes.
4022 // The rest of the pattern grammar is handled at the next level up.
4023 //
4024 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4025 void RegexCompile::nextChar(RegexPatternChar &c) {
4026 tailRecursion:
4027 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4028 c.fChar = nextCharLL();
4029 c.fQuoted = FALSE;
4030
4031 if (fQuoteMode) {
4032 c.fQuoted = TRUE;
4033 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4034 c.fChar == (UChar32)-1) {
4035 fQuoteMode = FALSE; // Exit quote mode,
4036 nextCharLL(); // discard the E
4037 // nextChar(c); // recurse to get the real next char
4038 goto tailRecursion; // Note: fuzz testing produced testcases that
4039 // resulted in stack overflow here.
4040 }
4041 }
4042 else if (fInBackslashQuote) {
4043 // The current character immediately follows a '\'
4044 // Don't check for any further escapes, just return it as-is.
4045 // Don't set c.fQuoted, because that would prevent the state machine from
4046 // dispatching on the character.
4047 fInBackslashQuote = FALSE;
4048 }
4049 else
4050 {
4051 // We are not in a \Q quoted region \E of the source.
4052 //
4053 if (fModeFlags & UREGEX_COMMENTS) {
4054 //
4055 // We are in free-spacing and comments mode.
4056 // Scan through any white space and comments, until we
4057 // reach a significant character or the end of inut.
4058 for (;;) {
4059 if (c.fChar == (UChar32)-1) {
4060 break; // End of Input
4061 }
4062 if (c.fChar == chPound && fEOLComments == TRUE) {
4063 // Start of a comment. Consume the rest of it, until EOF or a new line
4064 for (;;) {
4065 c.fChar = nextCharLL();
4066 if (c.fChar == (UChar32)-1 || // EOF
4067 c.fChar == chCR ||
4068 c.fChar == chLF ||
4069 c.fChar == chNEL ||
4070 c.fChar == chLS) {
4071 break;
4072 }
4073 }
4074 }
4075 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4076 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4077 break;
4078 }
4079 c.fChar = nextCharLL();
4080 }
4081 }
4082
4083 //
4084 // check for backslash escaped characters.
4085 //
4086 if (c.fChar == chBackSlash) {
4087 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4088 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4089 //
4090 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4091 // Includes \uxxxx, \n, \r, many others.
4092 // Return the single equivalent character.
4093 //
4094 nextCharLL(); // get & discard the peeked char.
4095 c.fQuoted = TRUE;
4096
4097 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4098 int32_t endIndex = (int32_t)pos;
4099 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4100
4101 if (endIndex == pos) {
4102 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4103 }
4104 fCharNum += endIndex - pos;
4105 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4106 } else {
4107 int32_t offset = 0;
4108 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4109
4110 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4111 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4112
4113 if (offset == 0) {
4114 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4115 } else if (context.lastOffset == offset) {
4116 UTEXT_PREVIOUS32(fRXPat->fPattern);
4117 } else if (context.lastOffset != offset-1) {
4118 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4119 }
4120 fCharNum += offset;
4121 }
4122 }
4123 else if (peekCharLL() == chDigit0) {
4124 // Octal Escape, using Java Regexp Conventions
4125 // which are \0 followed by 1-3 octal digits.
4126 // Different from ICU Unescape handling of Octal, which does not
4127 // require the leading 0.
4128 // Java also has the convention of only consuming 2 octal digits if
4129 // the three digit number would be > 0xff
4130 //
4131 c.fChar = 0;
4132 nextCharLL(); // Consume the initial 0.
4133 int index;
4134 for (index=0; index<3; index++) {
4135 int32_t ch = peekCharLL();
4136 if (ch<chDigit0 || ch>chDigit7) {
4137 if (index==0) {
4138 // \0 is not followed by any octal digits.
4139 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4140 }
4141 break;
4142 }
4143 c.fChar <<= 3;
4144 c.fChar += ch&7;
4145 if (c.fChar <= 255) {
4146 nextCharLL();
4147 } else {
4148 // The last digit made the number too big. Forget we saw it.
4149 c.fChar >>= 3;
4150 }
4151 }
4152 c.fQuoted = TRUE;
4153 }
4154 else if (peekCharLL() == chQ) {
4155 // "\Q" enter quote mode, which will continue until "\E"
4156 fQuoteMode = TRUE;
4157 nextCharLL(); // discard the 'Q'.
4158 // nextChar(c); // recurse to get the real next char.
4159 goto tailRecursion; // Note: fuzz testing produced test cases that
4160 // resulted in stack overflow here.
4161 }
4162 else
4163 {
4164 // We are in a '\' escape that will be handled by the state table scanner.
4165 // Just return the backslash, but remember that the following char is to
4166 // be taken literally.
4167 fInBackslashQuote = TRUE;
4168 }
4169 }
4170 }
4171
4172 // re-enable # to end-of-line comments, in case they were disabled.
4173 // They are disabled by the parser upon seeing '(?', but this lasts for
4174 // the fetching of the next character only.
4175 fEOLComments = TRUE;
4176
4177 // putc(c.fChar, stdout);
4178 }
4179
4180
4181
4182 //------------------------------------------------------------------------------
4183 //
4184 // scanNamedChar
4185 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4186 //
4187 // The scan position will be at the 'N'. On return
4188 // the scan position should be just after the '}'
4189 //
4190 // Return the UChar32
4191 //
4192 //------------------------------------------------------------------------------
scanNamedChar()4193 UChar32 RegexCompile::scanNamedChar() {
4194 if (U_FAILURE(*fStatus)) {
4195 return 0;
4196 }
4197
4198 nextChar(fC);
4199 if (fC.fChar != chLBrace) {
4200 error(U_REGEX_PROPERTY_SYNTAX);
4201 return 0;
4202 }
4203
4204 UnicodeString charName;
4205 for (;;) {
4206 nextChar(fC);
4207 if (fC.fChar == chRBrace) {
4208 break;
4209 }
4210 if (fC.fChar == -1) {
4211 error(U_REGEX_PROPERTY_SYNTAX);
4212 return 0;
4213 }
4214 charName.append(fC.fChar);
4215 }
4216
4217 char name[100];
4218 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4219 (uint32_t)charName.length()>=sizeof(name)) {
4220 // All Unicode character names have only invariant characters.
4221 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4222 // which requires this error check
4223 error(U_REGEX_PROPERTY_SYNTAX);
4224 return 0;
4225 }
4226 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4227
4228 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4229 if (U_FAILURE(*fStatus)) {
4230 error(U_REGEX_PROPERTY_SYNTAX);
4231 }
4232
4233 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4234 return theChar;
4235 }
4236
4237 //------------------------------------------------------------------------------
4238 //
4239 // scanProp Construct a UnicodeSet from the text at the current scan
4240 // position, which will be of the form \p{whaterver}
4241 //
4242 // The scan position will be at the 'p' or 'P'. On return
4243 // the scan position should be just after the '}'
4244 //
4245 // Return a UnicodeSet, constructed from the \P pattern,
4246 // or NULL if the pattern is invalid.
4247 //
4248 //------------------------------------------------------------------------------
scanProp()4249 UnicodeSet *RegexCompile::scanProp() {
4250 UnicodeSet *uset = NULL;
4251
4252 if (U_FAILURE(*fStatus)) {
4253 return NULL;
4254 }
4255 (void)chLowerP; // Suppress compiler unused variable warning.
4256 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4257 UBool negated = (fC.fChar == chP);
4258
4259 UnicodeString propertyName;
4260 nextChar(fC);
4261 if (fC.fChar != chLBrace) {
4262 error(U_REGEX_PROPERTY_SYNTAX);
4263 return NULL;
4264 }
4265 for (;;) {
4266 nextChar(fC);
4267 if (fC.fChar == chRBrace) {
4268 break;
4269 }
4270 if (fC.fChar == -1) {
4271 // Hit the end of the input string without finding the closing '}'
4272 error(U_REGEX_PROPERTY_SYNTAX);
4273 return NULL;
4274 }
4275 propertyName.append(fC.fChar);
4276 }
4277 uset = createSetForProperty(propertyName, negated);
4278 nextChar(fC); // Move input scan to position following the closing '}'
4279 return uset;
4280 }
4281
4282 //------------------------------------------------------------------------------
4283 //
4284 // scanPosixProp Construct a UnicodeSet from the text at the current scan
4285 // position, which is expected be of the form [:property expression:]
4286 //
4287 // The scan position will be at the opening ':'. On return
4288 // the scan position must be on the closing ']'
4289 //
4290 // Return a UnicodeSet constructed from the pattern,
4291 // or NULL if this is not a valid POSIX-style set expression.
4292 // If not a property expression, restore the initial scan position
4293 // (to the opening ':')
4294 //
4295 // Note: the opening '[:' is not sufficient to guarantee that
4296 // this is a [:property:] expression.
4297 // [:'+=,] is a perfectly good ordinary set expression that
4298 // happens to include ':' as one of its characters.
4299 //
4300 //------------------------------------------------------------------------------
scanPosixProp()4301 UnicodeSet *RegexCompile::scanPosixProp() {
4302 UnicodeSet *uset = NULL;
4303
4304 if (U_FAILURE(*fStatus)) {
4305 return NULL;
4306 }
4307
4308 U_ASSERT(fC.fChar == chColon);
4309
4310 // Save the scanner state.
4311 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4312 int64_t savedScanIndex = fScanIndex;
4313 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4314 UBool savedQuoteMode = fQuoteMode;
4315 UBool savedInBackslashQuote = fInBackslashQuote;
4316 UBool savedEOLComments = fEOLComments;
4317 int64_t savedLineNum = fLineNum;
4318 int64_t savedCharNum = fCharNum;
4319 UChar32 savedLastChar = fLastChar;
4320 UChar32 savedPeekChar = fPeekChar;
4321 RegexPatternChar savedfC = fC;
4322
4323 // Scan for a closing ]. A little tricky because there are some perverse
4324 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4325 // ending on the second closing ].
4326
4327 UnicodeString propName;
4328 UBool negated = FALSE;
4329
4330 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4331 nextChar(fC);
4332 if (fC.fChar == chUp) {
4333 negated = TRUE;
4334 nextChar(fC);
4335 }
4336
4337 // Scan for the closing ":]", collecting the property name along the way.
4338 UBool sawPropSetTerminator = FALSE;
4339 for (;;) {
4340 propName.append(fC.fChar);
4341 nextChar(fC);
4342 if (fC.fQuoted || fC.fChar == -1) {
4343 // Escaped characters or end of input - either says this isn't a [:Property:]
4344 break;
4345 }
4346 if (fC.fChar == chColon) {
4347 nextChar(fC);
4348 if (fC.fChar == chRBracket) {
4349 sawPropSetTerminator = TRUE;
4350 }
4351 break;
4352 }
4353 }
4354
4355 if (sawPropSetTerminator) {
4356 uset = createSetForProperty(propName, negated);
4357 }
4358 else
4359 {
4360 // No closing ":]".
4361 // Restore the original scan position.
4362 // The main scanner will retry the input as a normal set expression,
4363 // not a [:Property:] expression.
4364 fScanIndex = savedScanIndex;
4365 fQuoteMode = savedQuoteMode;
4366 fInBackslashQuote = savedInBackslashQuote;
4367 fEOLComments = savedEOLComments;
4368 fLineNum = savedLineNum;
4369 fCharNum = savedCharNum;
4370 fLastChar = savedLastChar;
4371 fPeekChar = savedPeekChar;
4372 fC = savedfC;
4373 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4374 }
4375 return uset;
4376 }
4377
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4378 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4379 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4380 addCategory(set, U_GC_CF_MASK, ec);
4381 }
4382
4383 //
4384 // Create a Unicode Set from a Unicode Property expression.
4385 // This is common code underlying both \p{...} ane [:...:] expressions.
4386 // Includes trying the Java "properties" that aren't supported as
4387 // normal ICU UnicodeSet properties
4388 //
createSetForProperty(const UnicodeString & propName,UBool negated)4389 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4390
4391 if (U_FAILURE(*fStatus)) {
4392 return nullptr;
4393 }
4394 LocalPointer<UnicodeSet> set;
4395 UErrorCode status = U_ZERO_ERROR;
4396
4397 do { // non-loop, exists to allow breaks from the block.
4398 //
4399 // First try the property as we received it
4400 //
4401 UnicodeString setExpr;
4402 uint32_t usetFlags = 0;
4403 setExpr.append(u"[\\p{", -1);
4404 setExpr.append(propName);
4405 setExpr.append(u"}]", -1);
4406 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4407 usetFlags |= USET_CASE_INSENSITIVE;
4408 }
4409 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
4410 if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4411 break;
4412 }
4413
4414 //
4415 // The incoming property wasn't directly recognized by ICU.
4416
4417 // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4418 // Java accepts 'word' with mixed case.
4419 // Java accepts 'all' only in all lower case.
4420
4421 status = U_ZERO_ERROR;
4422 if (propName.caseCompare(u"word", -1, 0) == 0) {
4423 set.adoptInsteadAndCheckErrorCode(
4424 RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4425 break;
4426 }
4427 if (propName.compare(u"all", -1) == 0) {
4428 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4429 break;
4430 }
4431
4432
4433 // Do Java InBlock expressions
4434 //
4435 UnicodeString mPropName = propName;
4436 if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4437 status = U_ZERO_ERROR;
4438 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4439 if (U_FAILURE(status)) {
4440 break;
4441 }
4442 UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed.
4443 set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4444 break;
4445 }
4446
4447 // Check for the Java form "IsBooleanPropertyValue", which we will recast
4448 // as "BooleanPropertyValue". The property value can be either a
4449 // a General Category or a Script Name.
4450
4451 if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4452 mPropName.remove(0, 2); // Strip the "Is"
4453 if (mPropName.indexOf(u'=') >= 0) {
4454 // Reject any "Is..." property expression containing an '=', that is,
4455 // any non-binary property expression.
4456 status = U_REGEX_PROPERTY_SYNTAX;
4457 break;
4458 }
4459
4460 if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4461 mPropName.setTo(u"unassigned", -1);
4462 negated = !negated;
4463 } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4464 mPropName.setTo(u"Titlecase_Letter", -1);
4465 }
4466
4467 mPropName.insert(0, u"[\\p{", -1);
4468 mPropName.append(u"}]", -1);
4469 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4470
4471 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4472 set->closeOver(USET_CASE_INSENSITIVE);
4473 }
4474 break;
4475
4476 }
4477
4478 if (propName.startsWith(u"java", -1)) {
4479 status = U_ZERO_ERROR;
4480 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4481 if (U_FAILURE(status)) {
4482 break;
4483 }
4484 //
4485 // Try the various Java specific properties.
4486 // These all begin with "java"
4487 //
4488 if (propName.compare(u"javaDefined", -1) == 0) {
4489 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4490 set->complement();
4491 }
4492 else if (propName.compare(u"javaDigit", -1) == 0) {
4493 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4494 }
4495 else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4496 addIdentifierIgnorable(set.getAlias(), status);
4497 }
4498 else if (propName.compare(u"javaISOControl", -1) == 0) {
4499 set->add(0, 0x1F).add(0x7F, 0x9F);
4500 }
4501 else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4502 addCategory(set.getAlias(), U_GC_L_MASK, status);
4503 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4504 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4505 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4506 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4507 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4508 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4509 addIdentifierIgnorable(set.getAlias(), status);
4510 }
4511 else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4512 addCategory(set.getAlias(), U_GC_L_MASK, status);
4513 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4514 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4515 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4516 }
4517 else if (propName.compare(u"javaLetter", -1) == 0) {
4518 addCategory(set.getAlias(), U_GC_L_MASK, status);
4519 }
4520 else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4521 addCategory(set.getAlias(), U_GC_L_MASK, status);
4522 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4523 }
4524 else if (propName.compare(u"javaLowerCase", -1) == 0) {
4525 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4526 }
4527 else if (propName.compare(u"javaMirrored", -1) == 0) {
4528 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4529 }
4530 else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4531 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4532 }
4533 else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4534 set->add(0x10000, UnicodeSet::MAX_VALUE);
4535 }
4536 else if (propName.compare(u"javaTitleCase", -1) == 0) {
4537 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4538 }
4539 else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4540 addCategory(set.getAlias(), U_GC_L_MASK, status);
4541 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4542 }
4543 else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4544 addCategory(set.getAlias(), U_GC_L_MASK, status);
4545 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4546 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4547 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4548 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4549 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4550 addIdentifierIgnorable(set.getAlias(), status);
4551 }
4552 else if (propName.compare(u"javaUpperCase", -1) == 0) {
4553 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4554 }
4555 else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4556 set->add(0, UnicodeSet::MAX_VALUE);
4557 }
4558 else if (propName.compare(u"javaWhitespace", -1) == 0) {
4559 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4560 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4561 set->add(9, 0x0d).add(0x1c, 0x1f);
4562 } else {
4563 status = U_REGEX_PROPERTY_SYNTAX;
4564 }
4565
4566 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4567 set->closeOver(USET_CASE_INSENSITIVE);
4568 }
4569 break;
4570 }
4571
4572 // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4573 // extensions matched it.
4574 status = U_REGEX_PROPERTY_SYNTAX;
4575 } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4576
4577 if (U_SUCCESS(status)) {
4578 U_ASSERT(set.isValid());
4579 if (negated) {
4580 set->complement();
4581 }
4582 return set.orphan();
4583 } else {
4584 if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4585 status = U_REGEX_PROPERTY_SYNTAX;
4586 }
4587 error(status);
4588 return nullptr;
4589 }
4590 }
4591
4592
4593 //
4594 // SetEval Part of the evaluation of [set expressions].
4595 // Perform any pending (stacked) operations with precedence
4596 // equal or greater to that of the next operator encountered
4597 // in the expression.
4598 //
setEval(int32_t nextOp)4599 void RegexCompile::setEval(int32_t nextOp) {
4600 UnicodeSet *rightOperand = NULL;
4601 UnicodeSet *leftOperand = NULL;
4602 for (;;) {
4603 U_ASSERT(fSetOpStack.empty()==FALSE);
4604 int32_t pendingSetOperation = fSetOpStack.peeki();
4605 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4606 break;
4607 }
4608 fSetOpStack.popi();
4609 U_ASSERT(fSetStack.empty() == FALSE);
4610 rightOperand = (UnicodeSet *)fSetStack.peek();
4611 switch (pendingSetOperation) {
4612 case setNegation:
4613 rightOperand->complement();
4614 break;
4615 case setCaseClose:
4616 // TODO: need a simple close function. Ticket 6065
4617 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4618 rightOperand->removeAllStrings();
4619 break;
4620 case setDifference1:
4621 case setDifference2:
4622 fSetStack.pop();
4623 leftOperand = (UnicodeSet *)fSetStack.peek();
4624 leftOperand->removeAll(*rightOperand);
4625 delete rightOperand;
4626 break;
4627 case setIntersection1:
4628 case setIntersection2:
4629 fSetStack.pop();
4630 leftOperand = (UnicodeSet *)fSetStack.peek();
4631 leftOperand->retainAll(*rightOperand);
4632 delete rightOperand;
4633 break;
4634 case setUnion:
4635 fSetStack.pop();
4636 leftOperand = (UnicodeSet *)fSetStack.peek();
4637 leftOperand->addAll(*rightOperand);
4638 delete rightOperand;
4639 break;
4640 default:
4641 UPRV_UNREACHABLE;
4642 }
4643 }
4644 }
4645
setPushOp(int32_t op)4646 void RegexCompile::setPushOp(int32_t op) {
4647 setEval(op);
4648 fSetOpStack.push(op, *fStatus);
4649 fSetStack.push(new UnicodeSet(), *fStatus);
4650 }
4651
4652 U_NAMESPACE_END
4653 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4654
4655