1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // file: regexcmp.cpp
5 //
6 // Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 // All Rights Reserved.
8 //
9 // This file contains the ICU regular expression compiler, which is responsible
10 // for processing a regular expression pattern into the compiled form that
11 // is used by the match finding engine.
12 //
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstring.h"
32 #include "uvectr32.h"
33 #include "uvectr64.h"
34 #include "uassert.h"
35 #include "uinvchar.h"
36
37 #include "regeximp.h"
38 #include "regexcst.h" // Contains state table for the regex pattern parser.
39 // generated by a Perl script.
40 #include "regexcmp.h"
41 #include "regexst.h"
42 #include "regextxt.h"
43
44
45
46 U_NAMESPACE_BEGIN
47
48
49 //------------------------------------------------------------------------------
50 //
51 // Constructor.
52 //
53 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)54 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
55 fParenStack(status), fSetStack(status), fSetOpStack(status)
56 {
57 // Lazy init of all shared global sets (needed for init()'s empty text)
58 RegexStaticSets::initGlobals(&status);
59
60 fStatus = &status;
61
62 fRXPat = rxp;
63 fScanIndex = 0;
64 fLastChar = -1;
65 fPeekChar = -1;
66 fLineNum = 1;
67 fCharNum = 0;
68 fQuoteMode = FALSE;
69 fInBackslashQuote = FALSE;
70 fModeFlags = fRXPat->fFlags | 0x80000000;
71 fEOLComments = TRUE;
72
73 fMatchOpenParen = -1;
74 fMatchCloseParen = -1;
75 fCaptureName = NULL;
76 fLastSetLiteral = U_SENTINEL;
77
78 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
79 status = rxp->fDeferredStatus;
80 }
81 }
82
83 static const UChar chAmp = 0x26; // '&'
84 static const UChar chDash = 0x2d; // '-'
85
86
87 //------------------------------------------------------------------------------
88 //
89 // Destructor
90 //
91 //------------------------------------------------------------------------------
~RegexCompile()92 RegexCompile::~RegexCompile() {
93 delete fCaptureName; // Normally will be NULL, but can exist if pattern
94 // compilation stops with a syntax error.
95 }
96
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)97 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
98 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
99 }
100
101 //------------------------------------------------------------------------------
102 //
103 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
104 // The state tables are hand-written in the file regexcst.txt,
105 // and converted to the form used here by a perl
106 // script regexcst.pl
107 //
108 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)109 void RegexCompile::compile(
110 const UnicodeString &pat, // Source pat to be compiled.
111 UParseError &pp, // Error position info
112 UErrorCode &e) // Error Code
113 {
114 fRXPat->fPatternString = new UnicodeString(pat);
115 UText patternText = UTEXT_INITIALIZER;
116 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
117
118 if (U_SUCCESS(e)) {
119 compile(&patternText, pp, e);
120 utext_close(&patternText);
121 }
122 }
123
124 //
125 // compile, UText mode
126 // All the work is actually done here.
127 //
compile(UText * pat,UParseError & pp,UErrorCode & e)128 void RegexCompile::compile(
129 UText *pat, // Source pat to be compiled.
130 UParseError &pp, // Error position info
131 UErrorCode &e) // Error Code
132 {
133 fStatus = &e;
134 fParseErr = &pp;
135 fStackPtr = 0;
136 fStack[fStackPtr] = 0;
137
138 if (U_FAILURE(*fStatus)) {
139 return;
140 }
141
142 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
143 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
144
145 // Prepare the RegexPattern object to receive the compiled pattern.
146 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
147 if (U_FAILURE(*fStatus)) {
148 return;
149 }
150 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
151 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
152
153
154 // Initialize the pattern scanning state machine
155 fPatternLength = utext_nativeLength(pat);
156 uint16_t state = 1;
157 const RegexTableEl *tableEl;
158
159 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
160 if (fModeFlags & UREGEX_LITERAL) {
161 fQuoteMode = TRUE;
162 }
163
164 nextChar(fC); // Fetch the first char from the pattern string.
165
166 //
167 // Main loop for the regex pattern parsing state machine.
168 // Runs once per state transition.
169 // Each time through optionally performs, depending on the state table,
170 // - an advance to the the next pattern char
171 // - an action to be performed.
172 // - pushing or popping a state to/from the local state return stack.
173 // file regexcst.txt is the source for the state table. The logic behind
174 // recongizing the pattern syntax is there, not here.
175 //
176 for (;;) {
177 // Bail out if anything has gone wrong.
178 // Regex pattern parsing stops on the first error encountered.
179 if (U_FAILURE(*fStatus)) {
180 break;
181 }
182
183 U_ASSERT(state != 0);
184
185 // Find the state table element that matches the input char from the pattern, or the
186 // class of the input character. Start with the first table row for this
187 // state, then linearly scan forward until we find a row that matches the
188 // character. The last row for each state always matches all characters, so
189 // the search will stop there, if not before.
190 //
191 tableEl = &gRuleParseStateTable[state];
192 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
193 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
194
195 for (;;) { // loop through table rows belonging to this state, looking for one
196 // that matches the current input char.
197 REGEX_SCAN_DEBUG_PRINTF(("."));
198 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
199 // Table row specified an individual character, not a set, and
200 // the input character is not quoted, and
201 // the input character matched it.
202 break;
203 }
204 if (tableEl->fCharClass == 255) {
205 // Table row specified default, match anything character class.
206 break;
207 }
208 if (tableEl->fCharClass == 254 && fC.fQuoted) {
209 // Table row specified "quoted" and the char was quoted.
210 break;
211 }
212 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
213 // Table row specified eof and we hit eof on the input.
214 break;
215 }
216
217 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
218 fC.fQuoted == FALSE && // char is not escaped &&
219 fC.fChar != (UChar32)-1) { // char is not EOF
220 U_ASSERT(tableEl->fCharClass <= 137);
221 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
222 // Table row specified a character class, or set of characters,
223 // and the current char matches it.
224 break;
225 }
226 }
227
228 // No match on this row, advance to the next row for this state,
229 tableEl++;
230 }
231 REGEX_SCAN_DEBUG_PRINTF(("\n"));
232
233 //
234 // We've found the row of the state table that matches the current input
235 // character from the rules string.
236 // Perform any action specified by this row in the state table.
237 if (doParseActions(tableEl->fAction) == FALSE) {
238 // Break out of the state machine loop if the
239 // the action signalled some kind of error, or
240 // the action was to exit, occurs on normal end-of-rules-input.
241 break;
242 }
243
244 if (tableEl->fPushState != 0) {
245 fStackPtr++;
246 if (fStackPtr >= kStackSize) {
247 error(U_REGEX_INTERNAL_ERROR);
248 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
249 fStackPtr--;
250 }
251 fStack[fStackPtr] = tableEl->fPushState;
252 }
253
254 //
255 // NextChar. This is where characters are actually fetched from the pattern.
256 // Happens under control of the 'n' tag in the state table.
257 //
258 if (tableEl->fNextChar) {
259 nextChar(fC);
260 }
261
262 // Get the next state from the table entry, or from the
263 // state stack if the next state was specified as "pop".
264 if (tableEl->fNextState != 255) {
265 state = tableEl->fNextState;
266 } else {
267 state = fStack[fStackPtr];
268 fStackPtr--;
269 if (fStackPtr < 0) {
270 // state stack underflow
271 // This will occur if the user pattern has mis-matched parentheses,
272 // with extra close parens.
273 //
274 fStackPtr++;
275 error(U_REGEX_MISMATCHED_PAREN);
276 }
277 }
278
279 }
280
281 if (U_FAILURE(*fStatus)) {
282 // Bail out if the pattern had errors.
283 // Set stack cleanup: a successful compile would have left it empty,
284 // but errors can leave temporary sets hanging around.
285 while (!fSetStack.empty()) {
286 delete (UnicodeSet *)fSetStack.pop();
287 }
288 return;
289 }
290
291 //
292 // The pattern has now been read and processed, and the compiled code generated.
293 //
294
295 //
296 // The pattern's fFrameSize so far has accumulated the requirements for
297 // storage for capture parentheses, counters, etc. that are encountered
298 // in the pattern. Add space for the two variables that are always
299 // present in the saved state: the input string position (int64_t) and
300 // the position in the compiled pattern.
301 //
302 allocateStackData(RESTACKFRAME_HDRCOUNT);
303
304 //
305 // Optimization pass 1: NOPs, back-references, and case-folding
306 //
307 stripNOPs();
308
309 //
310 // Get bounds for the minimum and maximum length of a string that this
311 // pattern can match. Used to avoid looking for matches in strings that
312 // are too short.
313 //
314 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
315
316 //
317 // Optimization pass 2: match start type
318 //
319 matchStartType();
320
321 //
322 // Set up fast latin-1 range sets
323 //
324 int32_t numSets = fRXPat->fSets->size();
325 fRXPat->fSets8 = new Regex8BitSet[numSets];
326 // Null pointer check.
327 if (fRXPat->fSets8 == NULL) {
328 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
329 return;
330 }
331 int32_t i;
332 for (i=0; i<numSets; i++) {
333 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
334 fRXPat->fSets8[i].init(s);
335 }
336
337 }
338
339
340
341
342
343 //------------------------------------------------------------------------------
344 //
345 // doParseAction Do some action during regex pattern parsing.
346 // Called by the parse state machine.
347 //
348 // Generation of the match engine PCode happens here, or
349 // in functions called from the parse actions defined here.
350 //
351 //
352 //------------------------------------------------------------------------------
doParseActions(int32_t action)353 UBool RegexCompile::doParseActions(int32_t action)
354 {
355 UBool returnVal = TRUE;
356
357 switch ((Regex_PatternParseAction)action) {
358
359 case doPatStart:
360 // Start of pattern compiles to:
361 //0 SAVE 2 Fall back to position of FAIL
362 //1 jmp 3
363 //2 FAIL Stop if we ever reach here.
364 //3 NOP Dummy, so start of pattern looks the same as
365 // the start of an ( grouping.
366 //4 NOP Resreved, will be replaced by a save if there are
367 // OR | operators at the top level
368 appendOp(URX_STATE_SAVE, 2);
369 appendOp(URX_JMP, 3);
370 appendOp(URX_FAIL, 0);
371
372 // Standard open nonCapture paren action emits the two NOPs and
373 // sets up the paren stack frame.
374 doParseActions(doOpenNonCaptureParen);
375 break;
376
377 case doPatFinish:
378 // We've scanned to the end of the pattern
379 // The end of pattern compiles to:
380 // URX_END
381 // which will stop the runtime match engine.
382 // Encountering end of pattern also behaves like a close paren,
383 // and forces fixups of the State Save at the beginning of the compiled pattern
384 // and of any OR operations at the top level.
385 //
386 handleCloseParen();
387 if (fParenStack.size() > 0) {
388 // Missing close paren in pattern.
389 error(U_REGEX_MISMATCHED_PAREN);
390 }
391
392 // add the END operation to the compiled pattern.
393 appendOp(URX_END, 0);
394
395 // Terminate the pattern compilation state machine.
396 returnVal = FALSE;
397 break;
398
399
400
401 case doOrOperator:
402 // Scanning a '|', as in (A|B)
403 {
404 // Generate code for any pending literals preceding the '|'
405 fixLiterals(FALSE);
406
407 // Insert a SAVE operation at the start of the pattern section preceding
408 // this OR at this level. This SAVE will branch the match forward
409 // to the right hand side of the OR in the event that the left hand
410 // side fails to match and backtracks. Locate the position for the
411 // save from the location on the top of the parentheses stack.
412 int32_t savePosition = fParenStack.popi();
413 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
414 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
415 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
416 fRXPat->fCompiledPat->setElementAt(op, savePosition);
417
418 // Append an JMP operation into the compiled pattern. The operand for
419 // the JMP will eventually be the location following the ')' for the
420 // group. This will be patched in later, when the ')' is encountered.
421 appendOp(URX_JMP, 0);
422
423 // Push the position of the newly added JMP op onto the parentheses stack.
424 // This registers if for fixup when this block's close paren is encountered.
425 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
426
427 // Append a NOP to the compiled pattern. This is the slot reserved
428 // for a SAVE in the event that there is yet another '|' following
429 // this one.
430 appendOp(URX_NOP, 0);
431 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
432 }
433 break;
434
435
436 case doBeginNamedCapture:
437 // Scanning (?<letter.
438 // The first letter of the name will come through again under doConinueNamedCapture.
439 fCaptureName = new UnicodeString();
440 if (fCaptureName == NULL) {
441 error(U_MEMORY_ALLOCATION_ERROR);
442 }
443 break;
444
445 case doContinueNamedCapture:
446 fCaptureName->append(fC.fChar);
447 break;
448
449 case doBadNamedCapture:
450 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
451 break;
452
453 case doOpenCaptureParen:
454 // Open Capturing Paren, possibly named.
455 // Compile to a
456 // - NOP, which later may be replaced by a save-state if the
457 // parenthesized group gets a * quantifier, followed by
458 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
459 // - NOP, which may later be replaced by a save-state if there
460 // is an '|' alternation within the parens.
461 //
462 // Each capture group gets three slots in the save stack frame:
463 // 0: Capture Group start position (in input string being matched.)
464 // 1: Capture Group end position.
465 // 2: Start of Match-in-progress.
466 // The first two locations are for a completed capture group, and are
467 // referred to by back references and the like.
468 // The third location stores the capture start position when an START_CAPTURE is
469 // encountered. This will be promoted to a completed capture when (and if) the corresponding
470 // END_CAPTURE is encountered.
471 {
472 fixLiterals();
473 appendOp(URX_NOP, 0);
474 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
475 appendOp(URX_START_CAPTURE, varsLoc);
476 appendOp(URX_NOP, 0);
477
478 // On the Parentheses stack, start a new frame and add the postions
479 // of the two NOPs. Depending on what follows in the pattern, the
480 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
481 // address of the end of the parenthesized group.
482 fParenStack.push(fModeFlags, *fStatus); // Match mode state
483 fParenStack.push(capturing, *fStatus); // Frame type.
484 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
485 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
486
487 // Save the mapping from group number to stack frame variable position.
488 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
489
490 // If this is a named capture group, add the name->group number mapping.
491 if (fCaptureName != NULL) {
492 int32_t groupNumber = fRXPat->fGroupMap->size();
493 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
494 fCaptureName = NULL; // hash table takes ownership of the name (key) string.
495 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
496 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
497 }
498 }
499 }
500 break;
501
502 case doOpenNonCaptureParen:
503 // Open non-caputuring (grouping only) Paren.
504 // Compile to a
505 // - NOP, which later may be replaced by a save-state if the
506 // parenthesized group gets a * quantifier, followed by
507 // - NOP, which may later be replaced by a save-state if there
508 // is an '|' alternation within the parens.
509 {
510 fixLiterals();
511 appendOp(URX_NOP, 0);
512 appendOp(URX_NOP, 0);
513
514 // On the Parentheses stack, start a new frame and add the postions
515 // of the two NOPs.
516 fParenStack.push(fModeFlags, *fStatus); // Match mode state
517 fParenStack.push(plain, *fStatus); // Begin a new frame.
518 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
519 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
520 }
521 break;
522
523
524 case doOpenAtomicParen:
525 // Open Atomic Paren. (?>
526 // Compile to a
527 // - NOP, which later may be replaced if the parenthesized group
528 // has a quantifier, followed by
529 // - STO_SP save state stack position, so it can be restored at the ")"
530 // - NOP, which may later be replaced by a save-state if there
531 // is an '|' alternation within the parens.
532 {
533 fixLiterals();
534 appendOp(URX_NOP, 0);
535 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
536 appendOp(URX_STO_SP, varLoc);
537 appendOp(URX_NOP, 0);
538
539 // On the Parentheses stack, start a new frame and add the postions
540 // of the two NOPs. Depending on what follows in the pattern, the
541 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
542 // address of the end of the parenthesized group.
543 fParenStack.push(fModeFlags, *fStatus); // Match mode state
544 fParenStack.push(atomic, *fStatus); // Frame type.
545 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
546 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
547 }
548 break;
549
550
551 case doOpenLookAhead:
552 // Positive Look-ahead (?= stuff )
553 //
554 // Note: Addition of transparent input regions, with the need to
555 // restore the original regions when failing out of a lookahead
556 // block, complicated this sequence. Some conbined opcodes
557 // might make sense - or might not, lookahead aren't that common.
558 //
559 // Caution: min match length optimization knows about this
560 // sequence; don't change without making updates there too.
561 //
562 // Compiles to
563 // 1 START_LA dataLoc Saves SP, Input Pos
564 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
565 // 3 JMP 6 continue ...
566 //
567 // 4. LA_END Look Ahead failed. Restore regions.
568 // 5. BACKTRACK and back track again.
569 //
570 // 6. NOP reserved for use by quantifiers on the block.
571 // Look-ahead can't have quantifiers, but paren stack
572 // compile time conventions require the slot anyhow.
573 // 7. NOP may be replaced if there is are '|' ops in the block.
574 // 8. code for parenthesized stuff.
575 // 9. LA_END
576 //
577 // Two data slots are reserved, for saving the stack ptr and the input position.
578 {
579 fixLiterals();
580 int32_t dataLoc = allocateData(2);
581 appendOp(URX_LA_START, dataLoc);
582 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
583 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
584 appendOp(URX_LA_END, dataLoc);
585 appendOp(URX_BACKTRACK, 0);
586 appendOp(URX_NOP, 0);
587 appendOp(URX_NOP, 0);
588
589 // On the Parentheses stack, start a new frame and add the postions
590 // of the NOPs.
591 fParenStack.push(fModeFlags, *fStatus); // Match mode state
592 fParenStack.push(lookAhead, *fStatus); // Frame type.
593 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
594 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
595 }
596 break;
597
598 case doOpenLookAheadNeg:
599 // Negated Lookahead. (?! stuff )
600 // Compiles to
601 // 1. START_LA dataloc
602 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
603 // // which continues with the match.
604 // 3. NOP // Std. Open Paren sequence, for possible '|'
605 // 4. code for parenthesized stuff.
606 // 5. END_LA // Cut back stack, remove saved state from step 2.
607 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
608 // 7. END_LA // Restore match region, in case look-ahead was using
609 // an alternate (transparent) region.
610 {
611 fixLiterals();
612 int32_t dataLoc = allocateData(2);
613 appendOp(URX_LA_START, dataLoc);
614 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later.
615 appendOp(URX_NOP, 0);
616
617 // On the Parentheses stack, start a new frame and add the postions
618 // of the StateSave and NOP.
619 fParenStack.push(fModeFlags, *fStatus); // Match mode state
620 fParenStack.push(negLookAhead, *fStatus); // Frame type
621 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
622 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
623
624 // Instructions #5 - #7 will be added when the ')' is encountered.
625 }
626 break;
627
628 case doOpenLookBehind:
629 {
630 // Compile a (?<= look-behind open paren.
631 //
632 // Compiles to
633 // 0 URX_LB_START dataLoc
634 // 1 URX_LB_CONT dataLoc
635 // 2 MinMatchLen
636 // 3 MaxMatchLen
637 // 4 URX_NOP Standard '(' boilerplate.
638 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
639 // 6 <code for LookBehind expression>
640 // 7 URX_LB_END dataLoc # Check match len, restore input len
641 // 8 URX_LA_END dataLoc # Restore stack, input pos
642 //
643 // Allocate a block of matcher data, to contain (when running a match)
644 // 0: Stack ptr on entry
645 // 1: Input Index on entry
646 // 2: Start index of match current match attempt.
647 // 3: Original Input String len.
648
649 // Generate match code for any pending literals.
650 fixLiterals();
651
652 // Allocate data space
653 int32_t dataLoc = allocateData(4);
654
655 // Emit URX_LB_START
656 appendOp(URX_LB_START, dataLoc);
657
658 // Emit URX_LB_CONT
659 appendOp(URX_LB_CONT, dataLoc);
660 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
661 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
662
663 // Emit the NOPs
664 appendOp(URX_NOP, 0);
665 appendOp(URX_NOP, 0);
666
667 // On the Parentheses stack, start a new frame and add the postions
668 // of the URX_LB_CONT and the NOP.
669 fParenStack.push(fModeFlags, *fStatus); // Match mode state
670 fParenStack.push(lookBehind, *fStatus); // Frame type
671 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
672 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
673
674 // The final two instructions will be added when the ')' is encountered.
675 }
676
677 break;
678
679 case doOpenLookBehindNeg:
680 {
681 // Compile a (?<! negated look-behind open paren.
682 //
683 // Compiles to
684 // 0 URX_LB_START dataLoc # Save entry stack, input len
685 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
686 // 2 MinMatchLen
687 // 3 MaxMatchLen
688 // 4 continueLoc (9)
689 // 5 URX_NOP Standard '(' boilerplate.
690 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
691 // 7 <code for LookBehind expression>
692 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
693 // 9 ...
694 //
695 // Allocate a block of matcher data, to contain (when running a match)
696 // 0: Stack ptr on entry
697 // 1: Input Index on entry
698 // 2: Start index of match current match attempt.
699 // 3: Original Input String len.
700
701 // Generate match code for any pending literals.
702 fixLiterals();
703
704 // Allocate data space
705 int32_t dataLoc = allocateData(4);
706
707 // Emit URX_LB_START
708 appendOp(URX_LB_START, dataLoc);
709
710 // Emit URX_LBN_CONT
711 appendOp(URX_LBN_CONT, dataLoc);
712 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
713 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
714 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
715
716 // Emit the NOPs
717 appendOp(URX_NOP, 0);
718 appendOp(URX_NOP, 0);
719
720 // On the Parentheses stack, start a new frame and add the postions
721 // of the URX_LB_CONT and the NOP.
722 fParenStack.push(fModeFlags, *fStatus); // Match mode state
723 fParenStack.push(lookBehindN, *fStatus); // Frame type
724 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
725 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
726
727 // The final two instructions will be added when the ')' is encountered.
728 }
729 break;
730
731 case doConditionalExpr:
732 // Conditionals such as (?(1)a:b)
733 case doPerlInline:
734 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
735 error(U_REGEX_UNIMPLEMENTED);
736 break;
737
738
739 case doCloseParen:
740 handleCloseParen();
741 if (fParenStack.size() <= 0) {
742 // Extra close paren, or missing open paren.
743 error(U_REGEX_MISMATCHED_PAREN);
744 }
745 break;
746
747 case doNOP:
748 break;
749
750
751 case doBadOpenParenType:
752 case doRuleError:
753 error(U_REGEX_RULE_SYNTAX);
754 break;
755
756
757 case doMismatchedParenErr:
758 error(U_REGEX_MISMATCHED_PAREN);
759 break;
760
761 case doPlus:
762 // Normal '+' compiles to
763 // 1. stuff to be repeated (already built)
764 // 2. jmp-sav 1
765 // 3. ...
766 //
767 // Or, if the item to be repeated can match a zero length string,
768 // 1. STO_INP_LOC data-loc
769 // 2. body of stuff to be repeated
770 // 3. JMP_SAV_X 2
771 // 4. ...
772
773 //
774 // Or, if the item to be repeated is simple
775 // 1. Item to be repeated.
776 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
777 // 3. LOOP_C stack location
778 {
779 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
780 int32_t frameLoc;
781
782 // Check for simple constructs, which may get special optimized code.
783 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
784 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
785
786 if (URX_TYPE(repeatedOp) == URX_SETREF) {
787 // Emit optimized code for [char set]+
788 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
789 frameLoc = allocateStackData(1);
790 appendOp(URX_LOOP_C, frameLoc);
791 break;
792 }
793
794 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
795 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
796 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
797 // Emit Optimized code for .+ operations.
798 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
799 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
800 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
801 loopOpI |= 1;
802 }
803 if (fModeFlags & UREGEX_UNIX_LINES) {
804 loopOpI |= 2;
805 }
806 appendOp(loopOpI);
807 frameLoc = allocateStackData(1);
808 appendOp(URX_LOOP_C, frameLoc);
809 break;
810 }
811
812 }
813
814 // General case.
815
816 // Check for minimum match length of zero, which requires
817 // extra loop-breaking code.
818 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
819 // Zero length match is possible.
820 // Emit the code sequence that can handle it.
821 insertOp(topLoc);
822 frameLoc = allocateStackData(1);
823
824 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
825 fRXPat->fCompiledPat->setElementAt(op, topLoc);
826
827 appendOp(URX_JMP_SAV_X, topLoc+1);
828 } else {
829 // Simpler code when the repeated body must match something non-empty
830 appendOp(URX_JMP_SAV, topLoc);
831 }
832 }
833 break;
834
835 case doNGPlus:
836 // Non-greedy '+?' compiles to
837 // 1. stuff to be repeated (already built)
838 // 2. state-save 1
839 // 3. ...
840 {
841 int32_t topLoc = blockTopLoc(FALSE);
842 appendOp(URX_STATE_SAVE, topLoc);
843 }
844 break;
845
846
847 case doOpt:
848 // Normal (greedy) ? quantifier.
849 // Compiles to
850 // 1. state save 3
851 // 2. body of optional block
852 // 3. ...
853 // Insert the state save into the compiled pattern, and we're done.
854 {
855 int32_t saveStateLoc = blockTopLoc(TRUE);
856 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
857 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
858 }
859 break;
860
861 case doNGOpt:
862 // Non-greedy ?? quantifier
863 // compiles to
864 // 1. jmp 4
865 // 2. body of optional block
866 // 3 jmp 5
867 // 4. state save 2
868 // 5 ...
869 // This code is less than ideal, with two jmps instead of one, because we can only
870 // insert one instruction at the top of the block being iterated.
871 {
872 int32_t jmp1_loc = blockTopLoc(TRUE);
873 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
874
875 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
876 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
877
878 appendOp(URX_JMP, jmp2_loc+2);
879
880 appendOp(URX_STATE_SAVE, jmp1_loc+1);
881 }
882 break;
883
884
885 case doStar:
886 // Normal (greedy) * quantifier.
887 // Compiles to
888 // 1. STATE_SAVE 4
889 // 2. body of stuff being iterated over
890 // 3. JMP_SAV 2
891 // 4. ...
892 //
893 // Or, if the body is a simple [Set],
894 // 1. LOOP_SR_I set number
895 // 2. LOOP_C stack location
896 // ...
897 //
898 // Or if this is a .*
899 // 1. LOOP_DOT_I (. matches all mode flag)
900 // 2. LOOP_C stack location
901 //
902 // Or, if the body can match a zero-length string, to inhibit infinite loops,
903 // 1. STATE_SAVE 5
904 // 2. STO_INP_LOC data-loc
905 // 3. body of stuff
906 // 4. JMP_SAV_X 2
907 // 5. ...
908 {
909 // location of item #1, the STATE_SAVE
910 int32_t topLoc = blockTopLoc(FALSE);
911 int32_t dataLoc = -1;
912
913 // Check for simple *, where the construct being repeated
914 // compiled to single opcode, and might be optimizable.
915 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
916 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
917
918 if (URX_TYPE(repeatedOp) == URX_SETREF) {
919 // Emit optimized code for a [char set]*
920 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
921 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
922 dataLoc = allocateStackData(1);
923 appendOp(URX_LOOP_C, dataLoc);
924 break;
925 }
926
927 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
928 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
929 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
930 // Emit Optimized code for .* operations.
931 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
932 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
933 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
934 loopOpI |= 1;
935 }
936 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
937 loopOpI |= 2;
938 }
939 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
940 dataLoc = allocateStackData(1);
941 appendOp(URX_LOOP_C, dataLoc);
942 break;
943 }
944 }
945
946 // Emit general case code for this *
947 // The optimizations did not apply.
948
949 int32_t saveStateLoc = blockTopLoc(TRUE);
950 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
951
952 // Check for minimum match length of zero, which requires
953 // extra loop-breaking code.
954 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
955 insertOp(saveStateLoc);
956 dataLoc = allocateStackData(1);
957
958 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
959 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
960 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
961 }
962
963 // Locate the position in the compiled pattern where the match will continue
964 // after completing the *. (4 or 5 in the comment above)
965 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
966
967 // Put together the save state op and store it into the compiled code.
968 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
969 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
970
971 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
972 appendOp(jmpOp);
973 }
974 break;
975
976 case doNGStar:
977 // Non-greedy *? quantifier
978 // compiles to
979 // 1. JMP 3
980 // 2. body of stuff being iterated over
981 // 3. STATE_SAVE 2
982 // 4 ...
983 {
984 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
985 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
986 int32_t jmpOp = buildOp(URX_JMP, saveLoc);
987 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
988 appendOp(URX_STATE_SAVE, jmpLoc+1);
989 }
990 break;
991
992
993 case doIntervalInit:
994 // The '{' opening an interval quantifier was just scanned.
995 // Init the counter varaiables that will accumulate the values as the digits
996 // are scanned.
997 fIntervalLow = 0;
998 fIntervalUpper = -1;
999 break;
1000
1001 case doIntevalLowerDigit:
1002 // Scanned a digit from the lower value of an {lower,upper} interval
1003 {
1004 int32_t digitValue = u_charDigitValue(fC.fChar);
1005 U_ASSERT(digitValue >= 0);
1006 int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1007 if (val > INT32_MAX) {
1008 error(U_REGEX_NUMBER_TOO_BIG);
1009 } else {
1010 fIntervalLow = (int32_t)val;
1011 }
1012 }
1013 break;
1014
1015 case doIntervalUpperDigit:
1016 // Scanned a digit from the upper value of an {lower,upper} interval
1017 {
1018 if (fIntervalUpper < 0) {
1019 fIntervalUpper = 0;
1020 }
1021 int32_t digitValue = u_charDigitValue(fC.fChar);
1022 U_ASSERT(digitValue >= 0);
1023 int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1024 if (val > INT32_MAX) {
1025 error(U_REGEX_NUMBER_TOO_BIG);
1026 } else {
1027 fIntervalUpper = (int32_t)val;
1028 }
1029 }
1030 break;
1031
1032 case doIntervalSame:
1033 // Scanned a single value interval like {27}. Upper = Lower.
1034 fIntervalUpper = fIntervalLow;
1035 break;
1036
1037 case doInterval:
1038 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1039 if (compileInlineInterval() == FALSE) {
1040 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1041 }
1042 break;
1043
1044 case doPossessiveInterval:
1045 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1046 {
1047 // Remember the loc for the top of the block being looped over.
1048 // (Can not reserve a slot in the compiled pattern at this time, because
1049 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1050 // once per block.)
1051 int32_t topLoc = blockTopLoc(FALSE);
1052
1053 // Produce normal looping code.
1054 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1055
1056 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1057 // just as if the loop was inclosed in atomic parentheses.
1058
1059 // First the STO_SP before the start of the loop
1060 insertOp(topLoc);
1061
1062 int32_t varLoc = allocateData(1); // Reserve a data location for saving the
1063 int32_t op = buildOp(URX_STO_SP, varLoc);
1064 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1065
1066 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1067 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1068 loopOp++; // point LoopOp after the just-inserted STO_SP
1069 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1070
1071 // Then the LD_SP after the end of the loop
1072 appendOp(URX_LD_SP, varLoc);
1073 }
1074
1075 break;
1076
1077 case doNGInterval:
1078 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1079 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1080 break;
1081
1082 case doIntervalError:
1083 error(U_REGEX_BAD_INTERVAL);
1084 break;
1085
1086 case doLiteralChar:
1087 // We've just scanned a "normal" character from the pattern,
1088 literalChar(fC.fChar);
1089 break;
1090
1091
1092 case doEscapedLiteralChar:
1093 // We've just scanned an backslashed escaped character with no
1094 // special meaning. It represents itself.
1095 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1096 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1097 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1098 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1099 }
1100 literalChar(fC.fChar);
1101 break;
1102
1103
1104 case doDotAny:
1105 // scanned a ".", match any single character.
1106 {
1107 fixLiterals(FALSE);
1108 if (fModeFlags & UREGEX_DOTALL) {
1109 appendOp(URX_DOTANY_ALL, 0);
1110 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1111 appendOp(URX_DOTANY_UNIX, 0);
1112 } else {
1113 appendOp(URX_DOTANY, 0);
1114 }
1115 }
1116 break;
1117
1118 case doCaret:
1119 {
1120 fixLiterals(FALSE);
1121 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1122 appendOp(URX_CARET, 0);
1123 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1124 appendOp(URX_CARET_M, 0);
1125 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1126 appendOp(URX_CARET, 0); // Only testing true start of input.
1127 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1128 appendOp(URX_CARET_M_UNIX, 0);
1129 }
1130 }
1131 break;
1132
1133 case doDollar:
1134 {
1135 fixLiterals(FALSE);
1136 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1137 appendOp(URX_DOLLAR, 0);
1138 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1139 appendOp(URX_DOLLAR_M, 0);
1140 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1141 appendOp(URX_DOLLAR_D, 0);
1142 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1143 appendOp(URX_DOLLAR_MD, 0);
1144 }
1145 }
1146 break;
1147
1148 case doBackslashA:
1149 fixLiterals(FALSE);
1150 appendOp(URX_CARET, 0);
1151 break;
1152
1153 case doBackslashB:
1154 {
1155 #if UCONFIG_NO_BREAK_ITERATION==1
1156 if (fModeFlags & UREGEX_UWORD) {
1157 error(U_UNSUPPORTED_ERROR);
1158 }
1159 #endif
1160 fixLiterals(FALSE);
1161 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1162 appendOp(op, 1);
1163 }
1164 break;
1165
1166 case doBackslashb:
1167 {
1168 #if UCONFIG_NO_BREAK_ITERATION==1
1169 if (fModeFlags & UREGEX_UWORD) {
1170 error(U_UNSUPPORTED_ERROR);
1171 }
1172 #endif
1173 fixLiterals(FALSE);
1174 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1175 appendOp(op, 0);
1176 }
1177 break;
1178
1179 case doBackslashD:
1180 fixLiterals(FALSE);
1181 appendOp(URX_BACKSLASH_D, 1);
1182 break;
1183
1184 case doBackslashd:
1185 fixLiterals(FALSE);
1186 appendOp(URX_BACKSLASH_D, 0);
1187 break;
1188
1189 case doBackslashG:
1190 fixLiterals(FALSE);
1191 appendOp(URX_BACKSLASH_G, 0);
1192 break;
1193
1194 case doBackslashH:
1195 fixLiterals(FALSE);
1196 appendOp(URX_BACKSLASH_H, 1);
1197 break;
1198
1199 case doBackslashh:
1200 fixLiterals(FALSE);
1201 appendOp(URX_BACKSLASH_H, 0);
1202 break;
1203
1204 case doBackslashR:
1205 fixLiterals(FALSE);
1206 appendOp(URX_BACKSLASH_R, 0);
1207 break;
1208
1209 case doBackslashS:
1210 fixLiterals(FALSE);
1211 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1212 break;
1213
1214 case doBackslashs:
1215 fixLiterals(FALSE);
1216 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1217 break;
1218
1219 case doBackslashV:
1220 fixLiterals(FALSE);
1221 appendOp(URX_BACKSLASH_V, 1);
1222 break;
1223
1224 case doBackslashv:
1225 fixLiterals(FALSE);
1226 appendOp(URX_BACKSLASH_V, 0);
1227 break;
1228
1229 case doBackslashW:
1230 fixLiterals(FALSE);
1231 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1232 break;
1233
1234 case doBackslashw:
1235 fixLiterals(FALSE);
1236 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1237 break;
1238
1239 case doBackslashX:
1240 fixLiterals(FALSE);
1241 appendOp(URX_BACKSLASH_X, 0);
1242 break;
1243
1244
1245 case doBackslashZ:
1246 fixLiterals(FALSE);
1247 appendOp(URX_DOLLAR, 0);
1248 break;
1249
1250 case doBackslashz:
1251 fixLiterals(FALSE);
1252 appendOp(URX_BACKSLASH_Z, 0);
1253 break;
1254
1255 case doEscapeError:
1256 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1257 break;
1258
1259 case doExit:
1260 fixLiterals(FALSE);
1261 returnVal = FALSE;
1262 break;
1263
1264 case doProperty:
1265 {
1266 fixLiterals(FALSE);
1267 UnicodeSet *theSet = scanProp();
1268 compileSet(theSet);
1269 }
1270 break;
1271
1272 case doNamedChar:
1273 {
1274 UChar32 c = scanNamedChar();
1275 literalChar(c);
1276 }
1277 break;
1278
1279
1280 case doBackRef:
1281 // BackReference. Somewhat unusual in that the front-end can not completely parse
1282 // the regular expression, because the number of digits to be consumed
1283 // depends on the number of capture groups that have been defined. So
1284 // we have to do it here instead.
1285 {
1286 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1287 int32_t groupNum = 0;
1288 UChar32 c = fC.fChar;
1289
1290 for (;;) {
1291 // Loop once per digit, for max allowed number of digits in a back reference.
1292 int32_t digit = u_charDigitValue(c);
1293 groupNum = groupNum * 10 + digit;
1294 if (groupNum >= numCaptureGroups) {
1295 break;
1296 }
1297 c = peekCharLL();
1298 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1299 break;
1300 }
1301 nextCharLL();
1302 }
1303
1304 // Scan of the back reference in the source regexp is complete. Now generate
1305 // the compiled code for it.
1306 // Because capture groups can be forward-referenced by back-references,
1307 // we fill the operand with the capture group number. At the end
1308 // of compilation, it will be changed to the variable's location.
1309 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1310 // and shouldn't enter this code path at all.
1311 fixLiterals(FALSE);
1312 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1313 appendOp(URX_BACKREF_I, groupNum);
1314 } else {
1315 appendOp(URX_BACKREF, groupNum);
1316 }
1317 }
1318 break;
1319
1320 case doBeginNamedBackRef:
1321 U_ASSERT(fCaptureName == NULL);
1322 fCaptureName = new UnicodeString;
1323 if (fCaptureName == NULL) {
1324 error(U_MEMORY_ALLOCATION_ERROR);
1325 }
1326 break;
1327
1328 case doContinueNamedBackRef:
1329 fCaptureName->append(fC.fChar);
1330 break;
1331
1332 case doCompleteNamedBackRef:
1333 {
1334 int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1335 if (groupNumber == 0) {
1336 // Group name has not been defined.
1337 // Could be a forward reference. If we choose to support them at some
1338 // future time, extra mechanism will be required at this point.
1339 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1340 } else {
1341 // Given the number, handle identically to a \n numbered back reference.
1342 // See comments above, under doBackRef
1343 fixLiterals(FALSE);
1344 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1345 appendOp(URX_BACKREF_I, groupNumber);
1346 } else {
1347 appendOp(URX_BACKREF, groupNumber);
1348 }
1349 }
1350 delete fCaptureName;
1351 fCaptureName = NULL;
1352 break;
1353 }
1354
1355 case doPossessivePlus:
1356 // Possessive ++ quantifier.
1357 // Compiles to
1358 // 1. STO_SP
1359 // 2. body of stuff being iterated over
1360 // 3. STATE_SAVE 5
1361 // 4. JMP 2
1362 // 5. LD_SP
1363 // 6. ...
1364 //
1365 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1366 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1367 //
1368 {
1369 // Emit the STO_SP
1370 int32_t topLoc = blockTopLoc(TRUE);
1371 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1372 int32_t op = buildOp(URX_STO_SP, stoLoc);
1373 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1374
1375 // Emit the STATE_SAVE
1376 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1377
1378 // Emit the JMP
1379 appendOp(URX_JMP, topLoc+1);
1380
1381 // Emit the LD_SP
1382 appendOp(URX_LD_SP, stoLoc);
1383 }
1384 break;
1385
1386 case doPossessiveStar:
1387 // Possessive *+ quantifier.
1388 // Compiles to
1389 // 1. STO_SP loc
1390 // 2. STATE_SAVE 5
1391 // 3. body of stuff being iterated over
1392 // 4. JMP 2
1393 // 5. LD_SP loc
1394 // 6 ...
1395 // TODO: do something to cut back the state stack each time through the loop.
1396 {
1397 // Reserve two slots at the top of the block.
1398 int32_t topLoc = blockTopLoc(TRUE);
1399 insertOp(topLoc);
1400
1401 // emit STO_SP loc
1402 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1403 int32_t op = buildOp(URX_STO_SP, stoLoc);
1404 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1405
1406 // Emit the SAVE_STATE 5
1407 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1408 op = buildOp(URX_STATE_SAVE, L7);
1409 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1410
1411 // Append the JMP operation.
1412 appendOp(URX_JMP, topLoc+1);
1413
1414 // Emit the LD_SP loc
1415 appendOp(URX_LD_SP, stoLoc);
1416 }
1417 break;
1418
1419 case doPossessiveOpt:
1420 // Possessive ?+ quantifier.
1421 // Compiles to
1422 // 1. STO_SP loc
1423 // 2. SAVE_STATE 5
1424 // 3. body of optional block
1425 // 4. LD_SP loc
1426 // 5. ...
1427 //
1428 {
1429 // Reserve two slots at the top of the block.
1430 int32_t topLoc = blockTopLoc(TRUE);
1431 insertOp(topLoc);
1432
1433 // Emit the STO_SP
1434 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1435 int32_t op = buildOp(URX_STO_SP, stoLoc);
1436 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1437
1438 // Emit the SAVE_STATE
1439 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1440 op = buildOp(URX_STATE_SAVE, continueLoc);
1441 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1442
1443 // Emit the LD_SP
1444 appendOp(URX_LD_SP, stoLoc);
1445 }
1446 break;
1447
1448
1449 case doBeginMatchMode:
1450 fNewModeFlags = fModeFlags;
1451 fSetModeFlag = TRUE;
1452 break;
1453
1454 case doMatchMode: // (?i) and similar
1455 {
1456 int32_t bit = 0;
1457 switch (fC.fChar) {
1458 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1459 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
1460 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1461 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1462 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
1463 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1464 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1465 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1466 default:
1467 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out
1468 // by the scanner.
1469 }
1470 if (fSetModeFlag) {
1471 fNewModeFlags |= bit;
1472 } else {
1473 fNewModeFlags &= ~bit;
1474 }
1475 }
1476 break;
1477
1478 case doSetMatchMode:
1479 // Emit code to match any pending literals, using the not-yet changed match mode.
1480 fixLiterals();
1481
1482 // We've got a (?i) or similar. The match mode is being changed, but
1483 // the change is not scoped to a parenthesized block.
1484 U_ASSERT(fNewModeFlags < 0);
1485 fModeFlags = fNewModeFlags;
1486
1487 break;
1488
1489
1490 case doMatchModeParen:
1491 // We've got a (?i: or similar. Begin a parenthesized block, save old
1492 // mode flags so they can be restored at the close of the block.
1493 //
1494 // Compile to a
1495 // - NOP, which later may be replaced by a save-state if the
1496 // parenthesized group gets a * quantifier, followed by
1497 // - NOP, which may later be replaced by a save-state if there
1498 // is an '|' alternation within the parens.
1499 {
1500 fixLiterals(FALSE);
1501 appendOp(URX_NOP, 0);
1502 appendOp(URX_NOP, 0);
1503
1504 // On the Parentheses stack, start a new frame and add the postions
1505 // of the two NOPs (a normal non-capturing () frame, except for the
1506 // saving of the orignal mode flags.)
1507 fParenStack.push(fModeFlags, *fStatus);
1508 fParenStack.push(flags, *fStatus); // Frame Marker
1509 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1510 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1511
1512 // Set the current mode flags to the new values.
1513 U_ASSERT(fNewModeFlags < 0);
1514 fModeFlags = fNewModeFlags;
1515 }
1516 break;
1517
1518 case doBadModeFlag:
1519 error(U_REGEX_INVALID_FLAG);
1520 break;
1521
1522 case doSuppressComments:
1523 // We have just scanned a '(?'. We now need to prevent the character scanner from
1524 // treating a '#' as a to-the-end-of-line comment.
1525 // (This Perl compatibility just gets uglier and uglier to do...)
1526 fEOLComments = FALSE;
1527 break;
1528
1529
1530 case doSetAddAmp:
1531 {
1532 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1533 set->add(chAmp);
1534 }
1535 break;
1536
1537 case doSetAddDash:
1538 {
1539 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1540 set->add(chDash);
1541 }
1542 break;
1543
1544 case doSetBackslash_s:
1545 {
1546 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1547 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1548 break;
1549 }
1550
1551 case doSetBackslash_S:
1552 {
1553 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1555 SSet.complement();
1556 set->addAll(SSet);
1557 break;
1558 }
1559
1560 case doSetBackslash_d:
1561 {
1562 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1563 // TODO - make a static set, ticket 6058.
1564 addCategory(set, U_GC_ND_MASK, *fStatus);
1565 break;
1566 }
1567
1568 case doSetBackslash_D:
1569 {
1570 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1571 UnicodeSet digits;
1572 // TODO - make a static set, ticket 6058.
1573 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1574 digits.complement();
1575 set->addAll(digits);
1576 break;
1577 }
1578
1579 case doSetBackslash_h:
1580 {
1581 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1582 UnicodeSet h;
1583 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1584 h.add((UChar32)9); // Tab
1585 set->addAll(h);
1586 break;
1587 }
1588
1589 case doSetBackslash_H:
1590 {
1591 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1592 UnicodeSet h;
1593 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1594 h.add((UChar32)9); // Tab
1595 h.complement();
1596 set->addAll(h);
1597 break;
1598 }
1599
1600 case doSetBackslash_v:
1601 {
1602 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1603 set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1604 set->add((UChar32)0x85);
1605 set->add((UChar32)0x2028, (UChar32)0x2029);
1606 break;
1607 }
1608
1609 case doSetBackslash_V:
1610 {
1611 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1612 UnicodeSet v;
1613 v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1614 v.add((UChar32)0x85);
1615 v.add((UChar32)0x2028, (UChar32)0x2029);
1616 v.complement();
1617 set->addAll(v);
1618 break;
1619 }
1620
1621 case doSetBackslash_w:
1622 {
1623 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1624 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1625 break;
1626 }
1627
1628 case doSetBackslash_W:
1629 {
1630 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1631 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1632 SSet.complement();
1633 set->addAll(SSet);
1634 break;
1635 }
1636
1637 case doSetBegin:
1638 fixLiterals(FALSE);
1639 fSetStack.push(new UnicodeSet(), *fStatus);
1640 fSetOpStack.push(setStart, *fStatus);
1641 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1642 fSetOpStack.push(setCaseClose, *fStatus);
1643 }
1644 break;
1645
1646 case doSetBeginDifference1:
1647 // We have scanned something like [[abc]-[
1648 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1649 // Push a Difference operator, which will cause the new set to be subtracted from what
1650 // went before once it is created.
1651 setPushOp(setDifference1);
1652 fSetOpStack.push(setStart, *fStatus);
1653 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1654 fSetOpStack.push(setCaseClose, *fStatus);
1655 }
1656 break;
1657
1658 case doSetBeginIntersection1:
1659 // We have scanned something like [[abc]&[
1660 // Need both the '&' operator and the open '[' operator.
1661 setPushOp(setIntersection1);
1662 fSetOpStack.push(setStart, *fStatus);
1663 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1664 fSetOpStack.push(setCaseClose, *fStatus);
1665 }
1666 break;
1667
1668 case doSetBeginUnion:
1669 // We have scanned something like [[abc][
1670 // Need to handle the union operation explicitly [[abc] | [
1671 setPushOp(setUnion);
1672 fSetOpStack.push(setStart, *fStatus);
1673 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1674 fSetOpStack.push(setCaseClose, *fStatus);
1675 }
1676 break;
1677
1678 case doSetDifference2:
1679 // We have scanned something like [abc--
1680 // Consider this to unambiguously be a set difference operator.
1681 setPushOp(setDifference2);
1682 break;
1683
1684 case doSetEnd:
1685 // Have encountered the ']' that closes a set.
1686 // Force the evaluation of any pending operations within this set,
1687 // leave the completed set on the top of the set stack.
1688 setEval(setEnd);
1689 U_ASSERT(fSetOpStack.peeki()==setStart);
1690 fSetOpStack.popi();
1691 break;
1692
1693 case doSetFinish:
1694 {
1695 // Finished a complete set expression, including all nested sets.
1696 // The close bracket has already triggered clearing out pending set operators,
1697 // the operator stack should be empty and the operand stack should have just
1698 // one entry, the result set.
1699 U_ASSERT(fSetOpStack.empty());
1700 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1701 U_ASSERT(fSetStack.empty());
1702 compileSet(theSet);
1703 break;
1704 }
1705
1706 case doSetIntersection2:
1707 // Have scanned something like [abc&&
1708 setPushOp(setIntersection2);
1709 break;
1710
1711 case doSetLiteral:
1712 // Union the just-scanned literal character into the set being built.
1713 // This operation is the highest precedence set operation, so we can always do
1714 // it immediately, without waiting to see what follows. It is necessary to perform
1715 // any pending '-' or '&' operation first, because these have the same precedence
1716 // as union-ing in a literal'
1717 {
1718 setEval(setUnion);
1719 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1720 s->add(fC.fChar);
1721 fLastSetLiteral = fC.fChar;
1722 break;
1723 }
1724
1725 case doSetLiteralEscaped:
1726 // A back-slash escaped literal character was encountered.
1727 // Processing is the same as with setLiteral, above, with the addition of
1728 // the optional check for errors on escaped ASCII letters.
1729 {
1730 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1731 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1732 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1733 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1734 }
1735 setEval(setUnion);
1736 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1737 s->add(fC.fChar);
1738 fLastSetLiteral = fC.fChar;
1739 break;
1740 }
1741
1742 case doSetNamedChar:
1743 // Scanning a \N{UNICODE CHARACTER NAME}
1744 // Aside from the source of the character, the processing is identical to doSetLiteral,
1745 // above.
1746 {
1747 UChar32 c = scanNamedChar();
1748 setEval(setUnion);
1749 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1750 s->add(c);
1751 fLastSetLiteral = c;
1752 break;
1753 }
1754
1755 case doSetNamedRange:
1756 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1757 // The left character is already in the set, and is saved in fLastSetLiteral.
1758 // The right side needs to be picked up, the scan is at the 'N'.
1759 // Lower Limit > Upper limit being an error matches both Java
1760 // and ICU UnicodeSet behavior.
1761 {
1762 UChar32 c = scanNamedChar();
1763 if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1764 error(U_REGEX_INVALID_RANGE);
1765 }
1766 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1767 s->add(fLastSetLiteral, c);
1768 fLastSetLiteral = c;
1769 break;
1770 }
1771
1772
1773 case doSetNegate:
1774 // Scanned a '^' at the start of a set.
1775 // Push the negation operator onto the set op stack.
1776 // A twist for case-insensitive matching:
1777 // the case closure operation must happen _before_ negation.
1778 // But the case closure operation will already be on the stack if it's required.
1779 // This requires checking for case closure, and swapping the stack order
1780 // if it is present.
1781 {
1782 int32_t tosOp = fSetOpStack.peeki();
1783 if (tosOp == setCaseClose) {
1784 fSetOpStack.popi();
1785 fSetOpStack.push(setNegation, *fStatus);
1786 fSetOpStack.push(setCaseClose, *fStatus);
1787 } else {
1788 fSetOpStack.push(setNegation, *fStatus);
1789 }
1790 }
1791 break;
1792
1793 case doSetNoCloseError:
1794 error(U_REGEX_MISSING_CLOSE_BRACKET);
1795 break;
1796
1797 case doSetOpError:
1798 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1799 break;
1800
1801 case doSetPosixProp:
1802 {
1803 UnicodeSet *s = scanPosixProp();
1804 if (s != NULL) {
1805 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1806 tos->addAll(*s);
1807 delete s;
1808 } // else error. scanProp() reported the error status already.
1809 }
1810 break;
1811
1812 case doSetProp:
1813 // Scanned a \p \P within [brackets].
1814 {
1815 UnicodeSet *s = scanProp();
1816 if (s != NULL) {
1817 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1818 tos->addAll(*s);
1819 delete s;
1820 } // else error. scanProp() reported the error status already.
1821 }
1822 break;
1823
1824
1825 case doSetRange:
1826 // We have scanned literal-literal. Add the range to the set.
1827 // The left character is already in the set, and is saved in fLastSetLiteral.
1828 // The right side is the current character.
1829 // Lower Limit > Upper limit being an error matches both Java
1830 // and ICU UnicodeSet behavior.
1831 {
1832
1833 if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1834 error(U_REGEX_INVALID_RANGE);
1835 }
1836 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1837 s->add(fLastSetLiteral, fC.fChar);
1838 break;
1839 }
1840
1841 default:
1842 U_ASSERT(FALSE);
1843 error(U_REGEX_INTERNAL_ERROR);
1844 break;
1845 }
1846
1847 if (U_FAILURE(*fStatus)) {
1848 returnVal = FALSE;
1849 }
1850
1851 return returnVal;
1852 }
1853
1854
1855
1856 //------------------------------------------------------------------------------
1857 //
1858 // literalChar We've encountered a literal character from the pattern,
1859 // or an escape sequence that reduces to a character.
1860 // Add it to the string containing all literal chars/strings from
1861 // the pattern.
1862 //
1863 //------------------------------------------------------------------------------
literalChar(UChar32 c)1864 void RegexCompile::literalChar(UChar32 c) {
1865 fLiteralChars.append(c);
1866 }
1867
1868
1869 //------------------------------------------------------------------------------
1870 //
1871 // fixLiterals When compiling something that can follow a literal
1872 // string in a pattern, emit the code to match the
1873 // accumulated literal string.
1874 //
1875 // Optionally, split the last char of the string off into
1876 // a single "ONE_CHAR" operation, so that quantifiers can
1877 // apply to that char alone. Example: abc*
1878 // The * must apply to the 'c' only.
1879 //
1880 //------------------------------------------------------------------------------
fixLiterals(UBool split)1881 void RegexCompile::fixLiterals(UBool split) {
1882
1883 // If no literal characters have been scanned but not yet had code generated
1884 // for them, nothing needs to be done.
1885 if (fLiteralChars.length() == 0) {
1886 return;
1887 }
1888
1889 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1890 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1891
1892 // Split: We need to ensure that the last item in the compiled pattern
1893 // refers only to the last literal scanned in the pattern, so that
1894 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1895 // Split before case folding for case insensitive matches.
1896
1897 if (split) {
1898 fLiteralChars.truncate(indexOfLastCodePoint);
1899 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1900 // Note that the truncated literal string may be empty, in which case
1901 // nothing will be emitted.
1902
1903 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1904 fixLiterals(FALSE); // Second recursive call, code for the final code point.
1905 return;
1906 }
1907
1908 // If we are doing case-insensitive matching, case fold the string. This may expand
1909 // the string, e.g. the German sharp-s turns into "ss"
1910 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1911 fLiteralChars.foldCase();
1912 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1913 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1914 }
1915
1916 if (indexOfLastCodePoint == 0) {
1917 // Single character, emit a URX_ONECHAR op to match it.
1918 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1919 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1920 appendOp(URX_ONECHAR_I, lastCodePoint);
1921 } else {
1922 appendOp(URX_ONECHAR, lastCodePoint);
1923 }
1924 } else {
1925 // Two or more chars, emit a URX_STRING to match them.
1926 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1927 error(U_REGEX_PATTERN_TOO_BIG);
1928 }
1929 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1930 appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1931 } else {
1932 // TODO here: add optimization to split case sensitive strings of length two
1933 // into two single char ops, for efficiency.
1934 appendOp(URX_STRING, fRXPat->fLiteralText.length());
1935 }
1936 appendOp(URX_STRING_LEN, fLiteralChars.length());
1937
1938 // Add this string into the accumulated strings of the compiled pattern.
1939 fRXPat->fLiteralText.append(fLiteralChars);
1940 }
1941
1942 fLiteralChars.remove();
1943 }
1944
1945
buildOp(int32_t type,int32_t val)1946 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1947 if (U_FAILURE(*fStatus)) {
1948 return 0;
1949 }
1950 if (type < 0 || type > 255) {
1951 U_ASSERT(FALSE);
1952 error(U_REGEX_INTERNAL_ERROR);
1953 type = URX_RESERVED_OP;
1954 }
1955 if (val > 0x00ffffff) {
1956 U_ASSERT(FALSE);
1957 error(U_REGEX_INTERNAL_ERROR);
1958 val = 0;
1959 }
1960 if (val < 0) {
1961 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1962 U_ASSERT(FALSE);
1963 error(U_REGEX_INTERNAL_ERROR);
1964 return -1;
1965 }
1966 if (URX_TYPE(val) != 0xff) {
1967 U_ASSERT(FALSE);
1968 error(U_REGEX_INTERNAL_ERROR);
1969 return -1;
1970 }
1971 type = URX_RESERVED_OP_N;
1972 }
1973 return (type << 24) | val;
1974 }
1975
1976
1977 //------------------------------------------------------------------------------
1978 //
1979 // appendOp() Append a new instruction onto the compiled pattern
1980 // Includes error checking, limiting the size of the
1981 // pattern to lengths that can be represented in the
1982 // 24 bit operand field of an instruction.
1983 //
1984 //------------------------------------------------------------------------------
appendOp(int32_t op)1985 void RegexCompile::appendOp(int32_t op) {
1986 if (U_FAILURE(*fStatus)) {
1987 return;
1988 }
1989 fRXPat->fCompiledPat->addElement(op, *fStatus);
1990 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1991 error(U_REGEX_PATTERN_TOO_BIG);
1992 }
1993 }
1994
appendOp(int32_t type,int32_t val)1995 void RegexCompile::appendOp(int32_t type, int32_t val) {
1996 appendOp(buildOp(type, val));
1997 }
1998
1999
2000 //------------------------------------------------------------------------------
2001 //
2002 // insertOp() Insert a slot for a new opcode into the already
2003 // compiled pattern code.
2004 //
2005 // Fill the slot with a NOP. Our caller will replace it
2006 // with what they really wanted.
2007 //
2008 //------------------------------------------------------------------------------
insertOp(int32_t where)2009 void RegexCompile::insertOp(int32_t where) {
2010 UVector64 *code = fRXPat->fCompiledPat;
2011 U_ASSERT(where>0 && where < code->size());
2012
2013 int32_t nop = buildOp(URX_NOP, 0);
2014 code->insertElementAt(nop, where, *fStatus);
2015
2016 // Walk through the pattern, looking for any ops with targets that
2017 // were moved down by the insert. Fix them.
2018 int32_t loc;
2019 for (loc=0; loc<code->size(); loc++) {
2020 int32_t op = (int32_t)code->elementAti(loc);
2021 int32_t opType = URX_TYPE(op);
2022 int32_t opValue = URX_VAL(op);
2023 if ((opType == URX_JMP ||
2024 opType == URX_JMPX ||
2025 opType == URX_STATE_SAVE ||
2026 opType == URX_CTR_LOOP ||
2027 opType == URX_CTR_LOOP_NG ||
2028 opType == URX_JMP_SAV ||
2029 opType == URX_JMP_SAV_X ||
2030 opType == URX_RELOC_OPRND) && opValue > where) {
2031 // Target location for this opcode is after the insertion point and
2032 // needs to be incremented to adjust for the insertion.
2033 opValue++;
2034 op = buildOp(opType, opValue);
2035 code->setElementAt(op, loc);
2036 }
2037 }
2038
2039 // Now fix up the parentheses stack. All positive values in it are locations in
2040 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2041 for (loc=0; loc<fParenStack.size(); loc++) {
2042 int32_t x = fParenStack.elementAti(loc);
2043 U_ASSERT(x < code->size());
2044 if (x>where) {
2045 x++;
2046 fParenStack.setElementAt(x, loc);
2047 }
2048 }
2049
2050 if (fMatchCloseParen > where) {
2051 fMatchCloseParen++;
2052 }
2053 if (fMatchOpenParen > where) {
2054 fMatchOpenParen++;
2055 }
2056 }
2057
2058
2059 //------------------------------------------------------------------------------
2060 //
2061 // allocateData() Allocate storage in the matcher's static data area.
2062 // Return the index for the newly allocated data.
2063 // The storage won't actually exist until we are running a match
2064 // operation, but the storage indexes are inserted into various
2065 // opcodes while compiling the pattern.
2066 //
2067 //------------------------------------------------------------------------------
allocateData(int32_t size)2068 int32_t RegexCompile::allocateData(int32_t size) {
2069 if (U_FAILURE(*fStatus)) {
2070 return 0;
2071 }
2072 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2073 error(U_REGEX_INTERNAL_ERROR);
2074 return 0;
2075 }
2076 int32_t dataIndex = fRXPat->fDataSize;
2077 fRXPat->fDataSize += size;
2078 if (fRXPat->fDataSize >= 0x00fffff0) {
2079 error(U_REGEX_INTERNAL_ERROR);
2080 }
2081 return dataIndex;
2082 }
2083
2084
2085 //------------------------------------------------------------------------------
2086 //
2087 // allocateStackData() Allocate space in the back-tracking stack frame.
2088 // Return the index for the newly allocated data.
2089 // The frame indexes are inserted into various
2090 // opcodes while compiling the pattern, meaning that frame
2091 // size must be restricted to the size that will fit
2092 // as an operand (24 bits).
2093 //
2094 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2095 int32_t RegexCompile::allocateStackData(int32_t size) {
2096 if (U_FAILURE(*fStatus)) {
2097 return 0;
2098 }
2099 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2100 error(U_REGEX_INTERNAL_ERROR);
2101 return 0;
2102 }
2103 int32_t dataIndex = fRXPat->fFrameSize;
2104 fRXPat->fFrameSize += size;
2105 if (fRXPat->fFrameSize >= 0x00fffff0) {
2106 error(U_REGEX_PATTERN_TOO_BIG);
2107 }
2108 return dataIndex;
2109 }
2110
2111
2112 //------------------------------------------------------------------------------
2113 //
2114 // blockTopLoc() Find or create a location in the compiled pattern
2115 // at the start of the operation or block that has
2116 // just been compiled. Needed when a quantifier (* or
2117 // whatever) appears, and we need to add an operation
2118 // at the start of the thing being quantified.
2119 //
2120 // (Parenthesized Blocks) have a slot with a NOP that
2121 // is reserved for this purpose. .* or similar don't
2122 // and a slot needs to be added.
2123 //
2124 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2125 // at the returned location.
2126 // FALSE - just return the address,
2127 // do not reserve a location there.
2128 //
2129 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2130 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2131 int32_t theLoc;
2132 fixLiterals(TRUE); // Emit code for any pending literals.
2133 // If last item was a string, emit separate op for the its last char.
2134 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2135 {
2136 // The item just processed is a parenthesized block.
2137 theLoc = fMatchOpenParen; // A slot is already reserved for us.
2138 U_ASSERT(theLoc > 0);
2139 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2140 }
2141 else {
2142 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2143 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2144 // We need to make space now.
2145 theLoc = fRXPat->fCompiledPat->size()-1;
2146 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2147 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2148 // Strings take two opcode, we want the position of the first one.
2149 // We can have a string at this point if a single character case-folded to two.
2150 theLoc--;
2151 }
2152 if (reserveLoc) {
2153 int32_t nop = buildOp(URX_NOP, 0);
2154 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2155 }
2156 }
2157 return theLoc;
2158 }
2159
2160
2161
2162 //------------------------------------------------------------------------------
2163 //
2164 // handleCloseParen When compiling a close paren, we need to go back
2165 // and fix up any JMP or SAVE operations within the
2166 // parenthesized block that need to target the end
2167 // of the block. The locations of these are kept on
2168 // the paretheses stack.
2169 //
2170 // This function is called both when encountering a
2171 // real ) and at the end of the pattern.
2172 //
2173 //------------------------------------------------------------------------------
handleCloseParen()2174 void RegexCompile::handleCloseParen() {
2175 int32_t patIdx;
2176 int32_t patOp;
2177 if (fParenStack.size() <= 0) {
2178 error(U_REGEX_MISMATCHED_PAREN);
2179 return;
2180 }
2181
2182 // Emit code for any pending literals.
2183 fixLiterals(FALSE);
2184
2185 // Fixup any operations within the just-closed parenthesized group
2186 // that need to reference the end of the (block).
2187 // (The first one popped from the stack is an unused slot for
2188 // alternation (OR) state save, but applying the fixup to it does no harm.)
2189 for (;;) {
2190 patIdx = fParenStack.popi();
2191 if (patIdx < 0) {
2192 // value < 0 flags the start of the frame on the paren stack.
2193 break;
2194 }
2195 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2196 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2197 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
2198 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
2199 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2200 fMatchOpenParen = patIdx;
2201 }
2202
2203 // At the close of any parenthesized block, restore the match mode flags to
2204 // the value they had at the open paren. Saved value is
2205 // at the top of the paren stack.
2206 fModeFlags = fParenStack.popi();
2207 U_ASSERT(fModeFlags < 0);
2208
2209 // DO any additional fixups, depending on the specific kind of
2210 // parentesized grouping this is
2211
2212 switch (patIdx) {
2213 case plain:
2214 case flags:
2215 // No additional fixups required.
2216 // (Grouping-only parentheses)
2217 break;
2218 case capturing:
2219 // Capturing Parentheses.
2220 // Insert a End Capture op into the pattern.
2221 // The frame offset of the variables for this cg is obtained from the
2222 // start capture op and put it into the end-capture op.
2223 {
2224 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2225 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2226
2227 int32_t frameVarLocation = URX_VAL(captureOp);
2228 appendOp(URX_END_CAPTURE, frameVarLocation);
2229 }
2230 break;
2231 case atomic:
2232 // Atomic Parenthesis.
2233 // Insert a LD_SP operation to restore the state stack to the position
2234 // it was when the atomic parens were entered.
2235 {
2236 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2237 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2238 int32_t stoLoc = URX_VAL(stoOp);
2239 appendOp(URX_LD_SP, stoLoc);
2240 }
2241 break;
2242
2243 case lookAhead:
2244 {
2245 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2246 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2247 int32_t dataLoc = URX_VAL(startOp);
2248 appendOp(URX_LA_END, dataLoc);
2249 }
2250 break;
2251
2252 case negLookAhead:
2253 {
2254 // See comment at doOpenLookAheadNeg
2255 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2256 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2257 int32_t dataLoc = URX_VAL(startOp);
2258 appendOp(URX_LA_END, dataLoc);
2259 appendOp(URX_BACKTRACK, 0);
2260 appendOp(URX_LA_END, dataLoc);
2261
2262 // Patch the URX_SAVE near the top of the block.
2263 // The destination of the SAVE is the final LA_END that was just added.
2264 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2265 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2266 int32_t dest = fRXPat->fCompiledPat->size()-1;
2267 saveOp = buildOp(URX_STATE_SAVE, dest);
2268 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2269 }
2270 break;
2271
2272 case lookBehind:
2273 {
2274 // See comment at doOpenLookBehind.
2275
2276 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2277 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2278 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2279 int32_t dataLoc = URX_VAL(startOp);
2280 appendOp(URX_LB_END, dataLoc);
2281 appendOp(URX_LA_END, dataLoc);
2282
2283 // Determine the min and max bounds for the length of the
2284 // string that the pattern can match.
2285 // An unbounded upper limit is an error.
2286 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2287 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2288 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2289 if (URX_TYPE(maxML) != 0) {
2290 error(U_REGEX_LOOK_BEHIND_LIMIT);
2291 break;
2292 }
2293 if (maxML == INT32_MAX) {
2294 error(U_REGEX_LOOK_BEHIND_LIMIT);
2295 break;
2296 }
2297 U_ASSERT(minML <= maxML);
2298
2299 // Insert the min and max match len bounds into the URX_LB_CONT op that
2300 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2301 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2302 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2303
2304 }
2305 break;
2306
2307
2308
2309 case lookBehindN:
2310 {
2311 // See comment at doOpenLookBehindNeg.
2312
2313 // Append the URX_LBN_END to the compiled pattern.
2314 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2315 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2316 int32_t dataLoc = URX_VAL(startOp);
2317 appendOp(URX_LBN_END, dataLoc);
2318
2319 // Determine the min and max bounds for the length of the
2320 // string that the pattern can match.
2321 // An unbounded upper limit is an error.
2322 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2323 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2324 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2325 if (URX_TYPE(maxML) != 0) {
2326 error(U_REGEX_LOOK_BEHIND_LIMIT);
2327 break;
2328 }
2329 if (maxML == INT32_MAX) {
2330 error(U_REGEX_LOOK_BEHIND_LIMIT);
2331 break;
2332 }
2333 U_ASSERT(minML <= maxML);
2334
2335 // Insert the min and max match len bounds into the URX_LB_CONT op that
2336 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2337 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2338 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2339
2340 // Insert the pattern location to continue at after a successful match
2341 // as the last operand of the URX_LBN_CONT
2342 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2343 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2344 }
2345 break;
2346
2347
2348
2349 default:
2350 U_ASSERT(FALSE);
2351 }
2352
2353 // remember the next location in the compiled pattern.
2354 // The compilation of Quantifiers will look at this to see whether its looping
2355 // over a parenthesized block or a single item
2356 fMatchCloseParen = fRXPat->fCompiledPat->size();
2357 }
2358
2359
2360
2361 //------------------------------------------------------------------------------
2362 //
2363 // compileSet Compile the pattern operations for a reference to a
2364 // UnicodeSet.
2365 //
2366 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2367 void RegexCompile::compileSet(UnicodeSet *theSet)
2368 {
2369 if (theSet == NULL) {
2370 return;
2371 }
2372 // Remove any strings from the set.
2373 // There shoudn't be any, but just in case.
2374 // (Case Closure can add them; if we had a simple case closure avaialble that
2375 // ignored strings, that would be better.)
2376 theSet->removeAllStrings();
2377 int32_t setSize = theSet->size();
2378
2379 switch (setSize) {
2380 case 0:
2381 {
2382 // Set of no elements. Always fails to match.
2383 appendOp(URX_BACKTRACK, 0);
2384 delete theSet;
2385 }
2386 break;
2387
2388 case 1:
2389 {
2390 // The set contains only a single code point. Put it into
2391 // the compiled pattern as a single char operation rather
2392 // than a set, and discard the set itself.
2393 literalChar(theSet->charAt(0));
2394 delete theSet;
2395 }
2396 break;
2397
2398 default:
2399 {
2400 // The set contains two or more chars. (the normal case)
2401 // Put it into the compiled pattern as a set.
2402 int32_t setNumber = fRXPat->fSets->size();
2403 fRXPat->fSets->addElement(theSet, *fStatus);
2404 appendOp(URX_SETREF, setNumber);
2405 }
2406 }
2407 }
2408
2409
2410 //------------------------------------------------------------------------------
2411 //
2412 // compileInterval Generate the code for a {min, max} style interval quantifier.
2413 // Except for the specific opcodes used, the code is the same
2414 // for all three types (greedy, non-greedy, possessive) of
2415 // intervals. The opcodes are supplied as parameters.
2416 // (There are two sets of opcodes - greedy & possessive use the
2417 // same ones, while non-greedy has it's own.)
2418 //
2419 // The code for interval loops has this form:
2420 // 0 CTR_INIT counter loc (in stack frame)
2421 // 1 5 patt address of CTR_LOOP at bottom of block
2422 // 2 min count
2423 // 3 max count (-1 for unbounded)
2424 // 4 ... block to be iterated over
2425 // 5 CTR_LOOP
2426 //
2427 // In
2428 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2429 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2430 {
2431 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2432 // four slots in the compiled code. Reserve them.
2433 int32_t topOfBlock = blockTopLoc(TRUE);
2434 insertOp(topOfBlock);
2435 insertOp(topOfBlock);
2436 insertOp(topOfBlock);
2437
2438 // The operands for the CTR_INIT opcode include the index in the matcher data
2439 // of the counter. Allocate it now. There are two data items
2440 // counterLoc --> Loop counter
2441 // +1 --> Input index (for breaking non-progressing loops)
2442 // (Only present if unbounded upper limit on loop)
2443 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
2444 int32_t counterLoc = allocateStackData(dataSize);
2445
2446 int32_t op = buildOp(InitOp, counterLoc);
2447 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2448
2449 // The second operand of CTR_INIT is the location following the end of the loop.
2450 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2451 // compilation of something later on causes the code to grow and the target
2452 // position to move.
2453 int32_t loopEnd = fRXPat->fCompiledPat->size();
2454 op = buildOp(URX_RELOC_OPRND, loopEnd);
2455 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2456
2457 // Followed by the min and max counts.
2458 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2459 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2460
2461 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2462 // Goes at end of the block being looped over, so just append to the code so far.
2463 appendOp(LoopOp, topOfBlock);
2464
2465 if ((fIntervalLow & 0xff000000) != 0 ||
2466 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2467 error(U_REGEX_NUMBER_TOO_BIG);
2468 }
2469
2470 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2471 error(U_REGEX_MAX_LT_MIN);
2472 }
2473 }
2474
2475
2476
compileInlineInterval()2477 UBool RegexCompile::compileInlineInterval() {
2478 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2479 // Too big to inline. Fail, which will cause looping code to be generated.
2480 // (Upper < Lower picks up unbounded upper and errors, both.)
2481 return FALSE;
2482 }
2483
2484 int32_t topOfBlock = blockTopLoc(FALSE);
2485 if (fIntervalUpper == 0) {
2486 // Pathological case. Attempt no matches, as if the block doesn't exist.
2487 // Discard the generated code for the block.
2488 // If the block included parens, discard the info pertaining to them as well.
2489 fRXPat->fCompiledPat->setSize(topOfBlock);
2490 if (fMatchOpenParen >= topOfBlock) {
2491 fMatchOpenParen = -1;
2492 }
2493 if (fMatchCloseParen >= topOfBlock) {
2494 fMatchCloseParen = -1;
2495 }
2496 return TRUE;
2497 }
2498
2499 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2500 // The thing being repeated is not a single op, but some
2501 // more complex block. Do it as a loop, not inlines.
2502 // Note that things "repeated" a max of once are handled as inline, because
2503 // the one copy of the code already generated is just fine.
2504 return FALSE;
2505 }
2506
2507 // Pick up the opcode that is to be repeated
2508 //
2509 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2510
2511 // Compute the pattern location where the inline sequence
2512 // will end, and set up the state save op that will be needed.
2513 //
2514 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2515 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2516 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2517 if (fIntervalLow == 0) {
2518 insertOp(topOfBlock);
2519 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2520 }
2521
2522
2523
2524 // Loop, emitting the op for the thing being repeated each time.
2525 // Loop starts at 1 because one instance of the op already exists in the pattern,
2526 // it was put there when it was originally encountered.
2527 int32_t i;
2528 for (i=1; i<fIntervalUpper; i++ ) {
2529 if (i >= fIntervalLow) {
2530 appendOp(saveOp);
2531 }
2532 appendOp(op);
2533 }
2534 return TRUE;
2535 }
2536
2537
2538
2539 //------------------------------------------------------------------------------
2540 //
2541 // caseInsensitiveStart given a single code point from a pattern string, determine the
2542 // set of characters that could potentially begin a case-insensitive
2543 // match of a string beginning with that character, using full Unicode
2544 // case insensitive matching.
2545 //
2546 // This is used in optimizing find().
2547 //
2548 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2549 // misses cases like this:
2550 // A string from the pattern begins with 'ss' (although all we know
2551 // in this context is that it begins with 's')
2552 // The pattern could match a string beginning with a German sharp-s
2553 //
2554 // To the ordinary case closure for a character c, we add all other
2555 // characters cx where the case closure of cx incudes a string form that begins
2556 // with the original character c.
2557 //
2558 // This function could be made smarter. The full pattern string is available
2559 // and it would be possible to verify that the extra characters being added
2560 // to the starting set fully match, rather than having just a first-char of the
2561 // folded form match.
2562 //
2563 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2564 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2565
2566 // Machine Generated below.
2567 // It may need updating with new versions of Unicode.
2568 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2569 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2570
2571 // Machine Generated Data. Do not hand edit.
2572 static const UChar32 RECaseFixCodePoints[] = {
2573 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2574 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2575 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2576 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2577 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2578
2579 static const int16_t RECaseFixStringOffsets[] = {
2580 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2581 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2582 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2583 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2584 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2585
2586 static const int16_t RECaseFixCounts[] = {
2587 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2588 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2589 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2590 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2591 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2592
2593 static const UChar RECaseFixData[] = {
2594 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2595 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2596 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2597 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2598 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2599 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2600 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2601 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2602 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2603 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2604 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2605
2606 // End of machine generated data.
2607
2608 if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2609 // This function should never be called with an invalid input character.
2610 U_ASSERT(FALSE);
2611 starterChars->clear();
2612 } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2613 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2614 starterChars->set(caseFoldedC, caseFoldedC);
2615
2616 int32_t i;
2617 for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2618 // Simple linear search through the sorted list of interesting code points.
2619 }
2620
2621 if (RECaseFixCodePoints[i] == c) {
2622 int32_t dataIndex = RECaseFixStringOffsets[i];
2623 int32_t numCharsToAdd = RECaseFixCounts[i];
2624 UChar32 cpToAdd = 0;
2625 for (int32_t j=0; j<numCharsToAdd; j++) {
2626 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2627 starterChars->add(cpToAdd);
2628 }
2629 }
2630
2631 starterChars->closeOver(USET_CASE_INSENSITIVE);
2632 starterChars->removeAllStrings();
2633 } else {
2634 // Not a cased character. Just return it alone.
2635 starterChars->set(c, c);
2636 }
2637 }
2638
2639
2640
2641
2642 //------------------------------------------------------------------------------
2643 //
2644 // matchStartType Determine how a match can start.
2645 // Used to optimize find() operations.
2646 //
2647 // Operation is very similar to minMatchLength(). Walk the compiled
2648 // pattern, keeping an on-going minimum-match-length. For any
2649 // op where the min match coming in is zero, add that ops possible
2650 // starting matches to the possible starts for the overall pattern.
2651 //
2652 //------------------------------------------------------------------------------
matchStartType()2653 void RegexCompile::matchStartType() {
2654 if (U_FAILURE(*fStatus)) {
2655 return;
2656 }
2657
2658
2659 int32_t loc; // Location in the pattern of the current op being processed.
2660 int32_t op; // The op being processed
2661 int32_t opType; // The opcode type of the op
2662 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2663 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2664
2665 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2666 // could have advanced the position in a match.
2667 // (Maximum match length so far == 0)
2668
2669 // forwardedLength is a vector holding minimum-match-length values that
2670 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2671 // It must be one longer than the pattern being checked because some ops
2672 // will jmp to a end-of-block+1 location from within a block, and we must
2673 // count those when checking the block.
2674 int32_t end = fRXPat->fCompiledPat->size();
2675 UVector32 forwardedLength(end+1, *fStatus);
2676 forwardedLength.setSize(end+1);
2677 for (loc=3; loc<end; loc++) {
2678 forwardedLength.setElementAt(INT32_MAX, loc);
2679 }
2680
2681 for (loc = 3; loc<end; loc++) {
2682 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2683 opType = URX_TYPE(op);
2684
2685 // The loop is advancing linearly through the pattern.
2686 // If the op we are now at was the destination of a branch in the pattern,
2687 // and that path has a shorter minimum length than the current accumulated value,
2688 // replace the current accumulated value.
2689 if (forwardedLength.elementAti(loc) < currentLen) {
2690 currentLen = forwardedLength.elementAti(loc);
2691 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2692 }
2693
2694 switch (opType) {
2695 // Ops that don't change the total length matched
2696 case URX_RESERVED_OP:
2697 case URX_END:
2698 case URX_FAIL:
2699 case URX_STRING_LEN:
2700 case URX_NOP:
2701 case URX_START_CAPTURE:
2702 case URX_END_CAPTURE:
2703 case URX_BACKSLASH_B:
2704 case URX_BACKSLASH_BU:
2705 case URX_BACKSLASH_G:
2706 case URX_BACKSLASH_Z:
2707 case URX_DOLLAR:
2708 case URX_DOLLAR_M:
2709 case URX_DOLLAR_D:
2710 case URX_DOLLAR_MD:
2711 case URX_RELOC_OPRND:
2712 case URX_STO_INP_LOC:
2713 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2714 case URX_BACKREF_I:
2715
2716 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2717 case URX_LD_SP:
2718 break;
2719
2720 case URX_CARET:
2721 if (atStart) {
2722 fRXPat->fStartType = START_START;
2723 }
2724 break;
2725
2726 case URX_CARET_M:
2727 case URX_CARET_M_UNIX:
2728 if (atStart) {
2729 fRXPat->fStartType = START_LINE;
2730 }
2731 break;
2732
2733 case URX_ONECHAR:
2734 if (currentLen == 0) {
2735 // This character could appear at the start of a match.
2736 // Add it to the set of possible starting characters.
2737 fRXPat->fInitialChars->add(URX_VAL(op));
2738 numInitialStrings += 2;
2739 }
2740 currentLen++;
2741 atStart = FALSE;
2742 break;
2743
2744
2745 case URX_SETREF:
2746 if (currentLen == 0) {
2747 int32_t sn = URX_VAL(op);
2748 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2749 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2750 fRXPat->fInitialChars->addAll(*s);
2751 numInitialStrings += 2;
2752 }
2753 currentLen++;
2754 atStart = FALSE;
2755 break;
2756
2757 case URX_LOOP_SR_I:
2758 // [Set]*, like a SETREF, above, in what it can match,
2759 // but may not match at all, so currentLen is not incremented.
2760 if (currentLen == 0) {
2761 int32_t sn = URX_VAL(op);
2762 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2763 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2764 fRXPat->fInitialChars->addAll(*s);
2765 numInitialStrings += 2;
2766 }
2767 atStart = FALSE;
2768 break;
2769
2770 case URX_LOOP_DOT_I:
2771 if (currentLen == 0) {
2772 // .* at the start of a pattern.
2773 // Any character can begin the match.
2774 fRXPat->fInitialChars->clear();
2775 fRXPat->fInitialChars->complement();
2776 numInitialStrings += 2;
2777 }
2778 atStart = FALSE;
2779 break;
2780
2781
2782 case URX_STATIC_SETREF:
2783 if (currentLen == 0) {
2784 int32_t sn = URX_VAL(op);
2785 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2786 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2787 fRXPat->fInitialChars->addAll(*s);
2788 numInitialStrings += 2;
2789 }
2790 currentLen++;
2791 atStart = FALSE;
2792 break;
2793
2794
2795
2796 case URX_STAT_SETREF_N:
2797 if (currentLen == 0) {
2798 int32_t sn = URX_VAL(op);
2799 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2800 UnicodeSet sc(*s);
2801 sc.complement();
2802 fRXPat->fInitialChars->addAll(sc);
2803 numInitialStrings += 2;
2804 }
2805 currentLen++;
2806 atStart = FALSE;
2807 break;
2808
2809
2810
2811 case URX_BACKSLASH_D:
2812 // Digit Char
2813 if (currentLen == 0) {
2814 UnicodeSet s;
2815 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2816 if (URX_VAL(op) != 0) {
2817 s.complement();
2818 }
2819 fRXPat->fInitialChars->addAll(s);
2820 numInitialStrings += 2;
2821 }
2822 currentLen++;
2823 atStart = FALSE;
2824 break;
2825
2826
2827 case URX_BACKSLASH_H:
2828 // Horiz white space
2829 if (currentLen == 0) {
2830 UnicodeSet s;
2831 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2832 s.add((UChar32)9); // Tab
2833 if (URX_VAL(op) != 0) {
2834 s.complement();
2835 }
2836 fRXPat->fInitialChars->addAll(s);
2837 numInitialStrings += 2;
2838 }
2839 currentLen++;
2840 atStart = FALSE;
2841 break;
2842
2843
2844 case URX_BACKSLASH_R: // Any line ending sequence
2845 case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2846 if (currentLen == 0) {
2847 UnicodeSet s;
2848 s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2849 s.add((UChar32)0x85);
2850 s.add((UChar32)0x2028, (UChar32)0x2029);
2851 if (URX_VAL(op) != 0) {
2852 // Complement option applies to URX_BACKSLASH_V only.
2853 s.complement();
2854 }
2855 fRXPat->fInitialChars->addAll(s);
2856 numInitialStrings += 2;
2857 }
2858 currentLen++;
2859 atStart = FALSE;
2860 break;
2861
2862
2863
2864 case URX_ONECHAR_I:
2865 // Case Insensitive Single Character.
2866 if (currentLen == 0) {
2867 UChar32 c = URX_VAL(op);
2868 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2869 UnicodeSet starters(c, c);
2870 starters.closeOver(USET_CASE_INSENSITIVE);
2871 // findCaseInsensitiveStarters(c, &starters);
2872 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2873 // The expanded folding can't match the pattern.
2874 fRXPat->fInitialChars->addAll(starters);
2875 } else {
2876 // Char has no case variants. Just add it as-is to the
2877 // set of possible starting chars.
2878 fRXPat->fInitialChars->add(c);
2879 }
2880 numInitialStrings += 2;
2881 }
2882 currentLen++;
2883 atStart = FALSE;
2884 break;
2885
2886
2887 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2888 case URX_DOTANY_ALL: // . matches one or two.
2889 case URX_DOTANY:
2890 case URX_DOTANY_UNIX:
2891 if (currentLen == 0) {
2892 // These constructs are all bad news when they appear at the start
2893 // of a match. Any character can begin the match.
2894 fRXPat->fInitialChars->clear();
2895 fRXPat->fInitialChars->complement();
2896 numInitialStrings += 2;
2897 }
2898 currentLen++;
2899 atStart = FALSE;
2900 break;
2901
2902
2903 case URX_JMPX:
2904 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2905 U_FALLTHROUGH;
2906 case URX_JMP:
2907 {
2908 int32_t jmpDest = URX_VAL(op);
2909 if (jmpDest < loc) {
2910 // Loop of some kind. Can safely ignore, the worst that will happen
2911 // is that we understate the true minimum length
2912 currentLen = forwardedLength.elementAti(loc+1);
2913
2914 } else {
2915 // Forward jump. Propagate the current min length to the target loc of the jump.
2916 U_ASSERT(jmpDest <= end+1);
2917 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2918 forwardedLength.setElementAt(currentLen, jmpDest);
2919 }
2920 }
2921 }
2922 atStart = FALSE;
2923 break;
2924
2925 case URX_JMP_SAV:
2926 case URX_JMP_SAV_X:
2927 // Combo of state save to the next loc, + jmp backwards.
2928 // Net effect on min. length computation is nothing.
2929 atStart = FALSE;
2930 break;
2931
2932 case URX_BACKTRACK:
2933 // Fails are kind of like a branch, except that the min length was
2934 // propagated already, by the state save.
2935 currentLen = forwardedLength.elementAti(loc+1);
2936 atStart = FALSE;
2937 break;
2938
2939
2940 case URX_STATE_SAVE:
2941 {
2942 // State Save, for forward jumps, propagate the current minimum.
2943 // of the state save.
2944 int32_t jmpDest = URX_VAL(op);
2945 if (jmpDest > loc) {
2946 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2947 forwardedLength.setElementAt(currentLen, jmpDest);
2948 }
2949 }
2950 }
2951 atStart = FALSE;
2952 break;
2953
2954
2955
2956
2957 case URX_STRING:
2958 {
2959 loc++;
2960 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2961 int32_t stringLen = URX_VAL(stringLenOp);
2962 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2963 U_ASSERT(stringLenOp >= 2);
2964 if (currentLen == 0) {
2965 // Add the starting character of this string to the set of possible starting
2966 // characters for this pattern.
2967 int32_t stringStartIdx = URX_VAL(op);
2968 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2969 fRXPat->fInitialChars->add(c);
2970
2971 // Remember this string. After the entire pattern has been checked,
2972 // if nothing else is identified that can start a match, we'll use it.
2973 numInitialStrings++;
2974 fRXPat->fInitialStringIdx = stringStartIdx;
2975 fRXPat->fInitialStringLen = stringLen;
2976 }
2977
2978 currentLen += stringLen;
2979 atStart = FALSE;
2980 }
2981 break;
2982
2983 case URX_STRING_I:
2984 {
2985 // Case-insensitive string. Unlike exact-match strings, we won't
2986 // attempt a string search for possible match positions. But we
2987 // do update the set of possible starting characters.
2988 loc++;
2989 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2990 int32_t stringLen = URX_VAL(stringLenOp);
2991 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2992 U_ASSERT(stringLenOp >= 2);
2993 if (currentLen == 0) {
2994 // Add the starting character of this string to the set of possible starting
2995 // characters for this pattern.
2996 int32_t stringStartIdx = URX_VAL(op);
2997 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2998 UnicodeSet s;
2999 findCaseInsensitiveStarters(c, &s);
3000 fRXPat->fInitialChars->addAll(s);
3001 numInitialStrings += 2; // Matching on an initial string not possible.
3002 }
3003 currentLen += stringLen;
3004 atStart = FALSE;
3005 }
3006 break;
3007
3008 case URX_CTR_INIT:
3009 case URX_CTR_INIT_NG:
3010 {
3011 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3012 // so location must be updated accordingly.
3013 // Loop Init Ops.
3014 // If the min loop count == 0
3015 // move loc forwards to the end of the loop, skipping over the body.
3016 // If the min count is > 0,
3017 // continue normal processing of the body of the loop.
3018 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3019 loopEndLoc = URX_VAL(loopEndLoc);
3020 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3021 if (minLoopCount == 0) {
3022 // Min Loop Count of 0, treat like a forward branch and
3023 // move the current minimum length up to the target
3024 // (end of loop) location.
3025 U_ASSERT(loopEndLoc <= end+1);
3026 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3027 forwardedLength.setElementAt(currentLen, loopEndLoc);
3028 }
3029 }
3030 loc+=3; // Skips over operands of CTR_INIT
3031 }
3032 atStart = FALSE;
3033 break;
3034
3035
3036 case URX_CTR_LOOP:
3037 case URX_CTR_LOOP_NG:
3038 // Loop ops.
3039 // The jump is conditional, backwards only.
3040 atStart = FALSE;
3041 break;
3042
3043 case URX_LOOP_C:
3044 // More loop ops. These state-save to themselves.
3045 // don't change the minimum match
3046 atStart = FALSE;
3047 break;
3048
3049
3050 case URX_LA_START:
3051 case URX_LB_START:
3052 {
3053 // Look-around. Scan forward until the matching look-ahead end,
3054 // without processing the look-around block. This is overly pessimistic.
3055
3056 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3057 // lookahead contains two LA_END instructions, so count goes up by two
3058 // for each LA_START.
3059 int32_t depth = (opType == URX_LA_START? 2: 1);
3060 for (;;) {
3061 loc++;
3062 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3063 if (URX_TYPE(op) == URX_LA_START) {
3064 depth+=2;
3065 }
3066 if (URX_TYPE(op) == URX_LB_START) {
3067 depth++;
3068 }
3069 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3070 depth--;
3071 if (depth == 0) {
3072 break;
3073 }
3074 }
3075 if (URX_TYPE(op) == URX_STATE_SAVE) {
3076 // Need this because neg lookahead blocks will FAIL to outside
3077 // of the block.
3078 int32_t jmpDest = URX_VAL(op);
3079 if (jmpDest > loc) {
3080 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3081 forwardedLength.setElementAt(currentLen, jmpDest);
3082 }
3083 }
3084 }
3085 U_ASSERT(loc <= end);
3086 }
3087 }
3088 break;
3089
3090 case URX_LA_END:
3091 case URX_LB_CONT:
3092 case URX_LB_END:
3093 case URX_LBN_CONT:
3094 case URX_LBN_END:
3095 U_ASSERT(FALSE); // Shouldn't get here. These ops should be
3096 // consumed by the scan in URX_LA_START and LB_START
3097
3098 break;
3099
3100 default:
3101 U_ASSERT(FALSE);
3102 }
3103
3104 }
3105
3106
3107 // We have finished walking through the ops. Check whether some forward jump
3108 // propagated a shorter length to location end+1.
3109 if (forwardedLength.elementAti(end+1) < currentLen) {
3110 currentLen = forwardedLength.elementAti(end+1);
3111 }
3112
3113
3114 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3115
3116
3117 // Sort out what we should check for when looking for candidate match start positions.
3118 // In order of preference,
3119 // 1. Start of input text buffer.
3120 // 2. A literal string.
3121 // 3. Start of line in multi-line mode.
3122 // 4. A single literal character.
3123 // 5. A character from a set of characters.
3124 //
3125 if (fRXPat->fStartType == START_START) {
3126 // Match only at the start of an input text string.
3127 // start type is already set. We're done.
3128 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3129 // Match beginning only with a literal string.
3130 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3131 U_ASSERT(fRXPat->fInitialChars->contains(c));
3132 fRXPat->fStartType = START_STRING;
3133 fRXPat->fInitialChar = c;
3134 } else if (fRXPat->fStartType == START_LINE) {
3135 // Match at start of line in Multi-Line mode.
3136 // Nothing to do here; everything is already set.
3137 } else if (fRXPat->fMinMatchLen == 0) {
3138 // Zero length match possible. We could start anywhere.
3139 fRXPat->fStartType = START_NO_INFO;
3140 } else if (fRXPat->fInitialChars->size() == 1) {
3141 // All matches begin with the same char.
3142 fRXPat->fStartType = START_CHAR;
3143 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3144 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3145 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3146 fRXPat->fMinMatchLen > 0) {
3147 // Matches start with a set of character smaller than the set of all chars.
3148 fRXPat->fStartType = START_SET;
3149 } else {
3150 // Matches can start with anything
3151 fRXPat->fStartType = START_NO_INFO;
3152 }
3153
3154 return;
3155 }
3156
3157
3158
3159 //------------------------------------------------------------------------------
3160 //
3161 // minMatchLength Calculate the length of the shortest string that could
3162 // match the specified pattern.
3163 // Length is in 16 bit code units, not code points.
3164 //
3165 // The calculated length may not be exact. The returned
3166 // value may be shorter than the actual minimum; it must
3167 // never be longer.
3168 //
3169 // start and end are the range of p-code operations to be
3170 // examined. The endpoints are included in the range.
3171 //
3172 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3173 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3174 if (U_FAILURE(*fStatus)) {
3175 return 0;
3176 }
3177
3178 U_ASSERT(start <= end);
3179 U_ASSERT(end < fRXPat->fCompiledPat->size());
3180
3181
3182 int32_t loc;
3183 int32_t op;
3184 int32_t opType;
3185 int32_t currentLen = 0;
3186
3187
3188 // forwardedLength is a vector holding minimum-match-length values that
3189 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3190 // It must be one longer than the pattern being checked because some ops
3191 // will jmp to a end-of-block+1 location from within a block, and we must
3192 // count those when checking the block.
3193 UVector32 forwardedLength(end+2, *fStatus);
3194 forwardedLength.setSize(end+2);
3195 for (loc=start; loc<=end+1; loc++) {
3196 forwardedLength.setElementAt(INT32_MAX, loc);
3197 }
3198
3199 for (loc = start; loc<=end; loc++) {
3200 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3201 opType = URX_TYPE(op);
3202
3203 // The loop is advancing linearly through the pattern.
3204 // If the op we are now at was the destination of a branch in the pattern,
3205 // and that path has a shorter minimum length than the current accumulated value,
3206 // replace the current accumulated value.
3207 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3208 // no-match-possible cases.
3209 if (forwardedLength.elementAti(loc) < currentLen) {
3210 currentLen = forwardedLength.elementAti(loc);
3211 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3212 }
3213
3214 switch (opType) {
3215 // Ops that don't change the total length matched
3216 case URX_RESERVED_OP:
3217 case URX_END:
3218 case URX_STRING_LEN:
3219 case URX_NOP:
3220 case URX_START_CAPTURE:
3221 case URX_END_CAPTURE:
3222 case URX_BACKSLASH_B:
3223 case URX_BACKSLASH_BU:
3224 case URX_BACKSLASH_G:
3225 case URX_BACKSLASH_Z:
3226 case URX_CARET:
3227 case URX_DOLLAR:
3228 case URX_DOLLAR_M:
3229 case URX_DOLLAR_D:
3230 case URX_DOLLAR_MD:
3231 case URX_RELOC_OPRND:
3232 case URX_STO_INP_LOC:
3233 case URX_CARET_M:
3234 case URX_CARET_M_UNIX:
3235 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3236 case URX_BACKREF_I:
3237
3238 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3239 case URX_LD_SP:
3240
3241 case URX_JMP_SAV:
3242 case URX_JMP_SAV_X:
3243 break;
3244
3245
3246 // Ops that match a minimum of one character (one or two 16 bit code units.)
3247 //
3248 case URX_ONECHAR:
3249 case URX_STATIC_SETREF:
3250 case URX_STAT_SETREF_N:
3251 case URX_SETREF:
3252 case URX_BACKSLASH_D:
3253 case URX_BACKSLASH_H:
3254 case URX_BACKSLASH_R:
3255 case URX_BACKSLASH_V:
3256 case URX_ONECHAR_I:
3257 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3258 case URX_DOTANY_ALL: // . matches one or two.
3259 case URX_DOTANY:
3260 case URX_DOTANY_UNIX:
3261 currentLen++;
3262 break;
3263
3264
3265 case URX_JMPX:
3266 loc++; // URX_JMPX has an extra operand, ignored here,
3267 // otherwise processed identically to URX_JMP.
3268 U_FALLTHROUGH;
3269 case URX_JMP:
3270 {
3271 int32_t jmpDest = URX_VAL(op);
3272 if (jmpDest < loc) {
3273 // Loop of some kind. Can safely ignore, the worst that will happen
3274 // is that we understate the true minimum length
3275 currentLen = forwardedLength.elementAti(loc+1);
3276 } else {
3277 // Forward jump. Propagate the current min length to the target loc of the jump.
3278 U_ASSERT(jmpDest <= end+1);
3279 if (forwardedLength.elementAti(jmpDest) > currentLen) {
3280 forwardedLength.setElementAt(currentLen, jmpDest);
3281 }
3282 }
3283 }
3284 break;
3285
3286 case URX_BACKTRACK:
3287 {
3288 // Back-tracks are kind of like a branch, except that the min length was
3289 // propagated already, by the state save.
3290 currentLen = forwardedLength.elementAti(loc+1);
3291 }
3292 break;
3293
3294
3295 case URX_STATE_SAVE:
3296 {
3297 // State Save, for forward jumps, propagate the current minimum.
3298 // of the state save.
3299 int32_t jmpDest = URX_VAL(op);
3300 if (jmpDest > loc) {
3301 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3302 forwardedLength.setElementAt(currentLen, jmpDest);
3303 }
3304 }
3305 }
3306 break;
3307
3308
3309 case URX_STRING:
3310 {
3311 loc++;
3312 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3313 currentLen += URX_VAL(stringLenOp);
3314 }
3315 break;
3316
3317
3318 case URX_STRING_I:
3319 {
3320 loc++;
3321 // TODO: with full case folding, matching input text may be shorter than
3322 // the string we have here. More smarts could put some bounds on it.
3323 // Assume a min length of one for now. A min length of zero causes
3324 // optimization failures for a pattern like "string"+
3325 // currentLen += URX_VAL(stringLenOp);
3326 currentLen += 1;
3327 }
3328 break;
3329
3330 case URX_CTR_INIT:
3331 case URX_CTR_INIT_NG:
3332 {
3333 // Loop Init Ops.
3334 // If the min loop count == 0
3335 // move loc forwards to the end of the loop, skipping over the body.
3336 // If the min count is > 0,
3337 // continue normal processing of the body of the loop.
3338 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3339 loopEndLoc = URX_VAL(loopEndLoc);
3340 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3341 if (minLoopCount == 0) {
3342 loc = loopEndLoc;
3343 } else {
3344 loc+=3; // Skips over operands of CTR_INIT
3345 }
3346 }
3347 break;
3348
3349
3350 case URX_CTR_LOOP:
3351 case URX_CTR_LOOP_NG:
3352 // Loop ops.
3353 // The jump is conditional, backwards only.
3354 break;
3355
3356 case URX_LOOP_SR_I:
3357 case URX_LOOP_DOT_I:
3358 case URX_LOOP_C:
3359 // More loop ops. These state-save to themselves.
3360 // don't change the minimum match - could match nothing at all.
3361 break;
3362
3363
3364 case URX_LA_START:
3365 case URX_LB_START:
3366 {
3367 // Look-around. Scan forward until the matching look-ahead end,
3368 // without processing the look-around block. This is overly pessimistic for look-ahead,
3369 // it assumes that the look-ahead match might be zero-length.
3370 // TODO: Positive lookahead could recursively do the block, then continue
3371 // with the longer of the block or the value coming in. Ticket 6060
3372 int32_t depth = (opType == URX_LA_START? 2: 1);;
3373 for (;;) {
3374 loc++;
3375 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3376 if (URX_TYPE(op) == URX_LA_START) {
3377 // The boilerplate for look-ahead includes two LA_END insturctions,
3378 // Depth will be decremented by each one when it is seen.
3379 depth += 2;
3380 }
3381 if (URX_TYPE(op) == URX_LB_START) {
3382 depth++;
3383 }
3384 if (URX_TYPE(op) == URX_LA_END) {
3385 depth--;
3386 if (depth == 0) {
3387 break;
3388 }
3389 }
3390 if (URX_TYPE(op)==URX_LBN_END) {
3391 depth--;
3392 if (depth == 0) {
3393 break;
3394 }
3395 }
3396 if (URX_TYPE(op) == URX_STATE_SAVE) {
3397 // Need this because neg lookahead blocks will FAIL to outside
3398 // of the block.
3399 int32_t jmpDest = URX_VAL(op);
3400 if (jmpDest > loc) {
3401 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3402 forwardedLength.setElementAt(currentLen, jmpDest);
3403 }
3404 }
3405 }
3406 U_ASSERT(loc <= end);
3407 }
3408 }
3409 break;
3410
3411 case URX_LA_END:
3412 case URX_LB_CONT:
3413 case URX_LB_END:
3414 case URX_LBN_CONT:
3415 case URX_LBN_END:
3416 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3417 // range being sized, which happens when measuring size of look-behind blocks.
3418 break;
3419
3420 default:
3421 U_ASSERT(FALSE);
3422 }
3423
3424 }
3425
3426 // We have finished walking through the ops. Check whether some forward jump
3427 // propagated a shorter length to location end+1.
3428 if (forwardedLength.elementAti(end+1) < currentLen) {
3429 currentLen = forwardedLength.elementAti(end+1);
3430 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3431 }
3432
3433 return currentLen;
3434 }
3435
3436 // Increment with overflow check.
3437 // val and delta will both be positive.
3438
safeIncrement(int32_t val,int32_t delta)3439 static int32_t safeIncrement(int32_t val, int32_t delta) {
3440 if (INT32_MAX - val > delta) {
3441 return val + delta;
3442 } else {
3443 return INT32_MAX;
3444 }
3445 }
3446
3447
3448 //------------------------------------------------------------------------------
3449 //
3450 // maxMatchLength Calculate the length of the longest string that could
3451 // match the specified pattern.
3452 // Length is in 16 bit code units, not code points.
3453 //
3454 // The calculated length may not be exact. The returned
3455 // value may be longer than the actual maximum; it must
3456 // never be shorter.
3457 //
3458 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3459 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3460 if (U_FAILURE(*fStatus)) {
3461 return 0;
3462 }
3463 U_ASSERT(start <= end);
3464 U_ASSERT(end < fRXPat->fCompiledPat->size());
3465
3466
3467 int32_t loc;
3468 int32_t op;
3469 int32_t opType;
3470 int32_t currentLen = 0;
3471 UVector32 forwardedLength(end+1, *fStatus);
3472 forwardedLength.setSize(end+1);
3473
3474 for (loc=start; loc<=end; loc++) {
3475 forwardedLength.setElementAt(0, loc);
3476 }
3477
3478 for (loc = start; loc<=end; loc++) {
3479 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3480 opType = URX_TYPE(op);
3481
3482 // The loop is advancing linearly through the pattern.
3483 // If the op we are now at was the destination of a branch in the pattern,
3484 // and that path has a longer maximum length than the current accumulated value,
3485 // replace the current accumulated value.
3486 if (forwardedLength.elementAti(loc) > currentLen) {
3487 currentLen = forwardedLength.elementAti(loc);
3488 }
3489
3490 switch (opType) {
3491 // Ops that don't change the total length matched
3492 case URX_RESERVED_OP:
3493 case URX_END:
3494 case URX_STRING_LEN:
3495 case URX_NOP:
3496 case URX_START_CAPTURE:
3497 case URX_END_CAPTURE:
3498 case URX_BACKSLASH_B:
3499 case URX_BACKSLASH_BU:
3500 case URX_BACKSLASH_G:
3501 case URX_BACKSLASH_Z:
3502 case URX_CARET:
3503 case URX_DOLLAR:
3504 case URX_DOLLAR_M:
3505 case URX_DOLLAR_D:
3506 case URX_DOLLAR_MD:
3507 case URX_RELOC_OPRND:
3508 case URX_STO_INP_LOC:
3509 case URX_CARET_M:
3510 case URX_CARET_M_UNIX:
3511
3512 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3513 case URX_LD_SP:
3514
3515 case URX_LB_END:
3516 case URX_LB_CONT:
3517 case URX_LBN_CONT:
3518 case URX_LBN_END:
3519 break;
3520
3521
3522 // Ops that increase that cause an unbounded increase in the length
3523 // of a matched string, or that increase it a hard to characterize way.
3524 // Call the max length unbounded, and stop further checking.
3525 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3526 case URX_BACKREF_I:
3527 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3528 currentLen = INT32_MAX;
3529 break;
3530
3531
3532 // Ops that match a max of one character (possibly two 16 bit code units.)
3533 //
3534 case URX_STATIC_SETREF:
3535 case URX_STAT_SETREF_N:
3536 case URX_SETREF:
3537 case URX_BACKSLASH_D:
3538 case URX_BACKSLASH_H:
3539 case URX_BACKSLASH_R:
3540 case URX_BACKSLASH_V:
3541 case URX_ONECHAR_I:
3542 case URX_DOTANY_ALL:
3543 case URX_DOTANY:
3544 case URX_DOTANY_UNIX:
3545 currentLen = safeIncrement(currentLen, 2);
3546 break;
3547
3548 // Single literal character. Increase current max length by one or two,
3549 // depending on whether the char is in the supplementary range.
3550 case URX_ONECHAR:
3551 currentLen = safeIncrement(currentLen, 1);
3552 if (URX_VAL(op) > 0x10000) {
3553 currentLen = safeIncrement(currentLen, 1);
3554 }
3555 break;
3556
3557 // Jumps.
3558 //
3559 case URX_JMP:
3560 case URX_JMPX:
3561 case URX_JMP_SAV:
3562 case URX_JMP_SAV_X:
3563 {
3564 int32_t jmpDest = URX_VAL(op);
3565 if (jmpDest < loc) {
3566 // Loop of some kind. Max match length is unbounded.
3567 currentLen = INT32_MAX;
3568 } else {
3569 // Forward jump. Propagate the current min length to the target loc of the jump.
3570 if (forwardedLength.elementAti(jmpDest) < currentLen) {
3571 forwardedLength.setElementAt(currentLen, jmpDest);
3572 }
3573 currentLen = 0;
3574 }
3575 }
3576 break;
3577
3578 case URX_BACKTRACK:
3579 // back-tracks are kind of like a branch, except that the max length was
3580 // propagated already, by the state save.
3581 currentLen = forwardedLength.elementAti(loc+1);
3582 break;
3583
3584
3585 case URX_STATE_SAVE:
3586 {
3587 // State Save, for forward jumps, propagate the current minimum.
3588 // of the state save.
3589 // For backwards jumps, they create a loop, maximum
3590 // match length is unbounded.
3591 int32_t jmpDest = URX_VAL(op);
3592 if (jmpDest > loc) {
3593 if (currentLen > forwardedLength.elementAti(jmpDest)) {
3594 forwardedLength.setElementAt(currentLen, jmpDest);
3595 }
3596 } else {
3597 currentLen = INT32_MAX;
3598 }
3599 }
3600 break;
3601
3602
3603
3604
3605 case URX_STRING:
3606 {
3607 loc++;
3608 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3609 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3610 break;
3611 }
3612
3613 case URX_STRING_I:
3614 // TODO: This code assumes that any user string that matches will be no longer
3615 // than our compiled string, with case insensitive matching.
3616 // Our compiled string has been case-folded already.
3617 //
3618 // Any matching user string will have no more code points than our
3619 // compiled (folded) string. Folding may add code points, but
3620 // not remove them.
3621 //
3622 // There is a potential problem if a supplemental code point
3623 // case-folds to a BMP code point. In this case our compiled string
3624 // could be shorter (in code units) than a matching user string.
3625 //
3626 // At this time (Unicode 6.1) there are no such characters, and this case
3627 // is not being handled. A test, intltest regex/Bug9283, will fail if
3628 // any problematic characters are added to Unicode.
3629 //
3630 // If this happens, we can make a set of the BMP chars that the
3631 // troublesome supplementals fold to, scan our string, and bump the
3632 // currentLen one extra for each that is found.
3633 //
3634 {
3635 loc++;
3636 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3637 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3638 }
3639 break;
3640
3641 case URX_CTR_INIT:
3642 case URX_CTR_INIT_NG:
3643 // For Loops, recursively call this function on the pattern for the loop body,
3644 // then multiply the result by the maximum loop count.
3645 {
3646 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3647 if (loopEndLoc == loc+4) {
3648 // Loop has an empty body. No affect on max match length.
3649 // Continue processing with code after the loop end.
3650 loc = loopEndLoc;
3651 break;
3652 }
3653
3654 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3655 if (maxLoopCount == -1) {
3656 // Unbounded Loop. No upper bound on match length.
3657 currentLen = INT32_MAX;
3658 break;
3659 }
3660
3661 U_ASSERT(loopEndLoc >= loc+4);
3662 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
3663 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3664 if (updatedLen >= INT32_MAX) {
3665 currentLen = INT32_MAX;
3666 break;
3667 }
3668 currentLen = (int32_t)updatedLen;
3669 loc = loopEndLoc;
3670 break;
3671 }
3672
3673 case URX_CTR_LOOP:
3674 case URX_CTR_LOOP_NG:
3675 // These opcodes will be skipped over by code for URX_CRT_INIT.
3676 // We shouldn't encounter them here.
3677 U_ASSERT(FALSE);
3678 break;
3679
3680 case URX_LOOP_SR_I:
3681 case URX_LOOP_DOT_I:
3682 case URX_LOOP_C:
3683 // For anything to do with loops, make the match length unbounded.
3684 currentLen = INT32_MAX;
3685 break;
3686
3687
3688
3689 case URX_LA_START:
3690 case URX_LA_END:
3691 // Look-ahead. Just ignore, treat the look-ahead block as if
3692 // it were normal pattern. Gives a too-long match length,
3693 // but good enough for now.
3694 break;
3695
3696 // End of look-ahead ops should always be consumed by the processing at
3697 // the URX_LA_START op.
3698 // U_ASSERT(FALSE);
3699 // break;
3700
3701 case URX_LB_START:
3702 {
3703 // Look-behind. Scan forward until the matching look-around end,
3704 // without processing the look-behind block.
3705 int32_t depth = 0;
3706 for (;;) {
3707 loc++;
3708 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3709 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3710 depth++;
3711 }
3712 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3713 if (depth == 0) {
3714 break;
3715 }
3716 depth--;
3717 }
3718 U_ASSERT(loc < end);
3719 }
3720 }
3721 break;
3722
3723 default:
3724 U_ASSERT(FALSE);
3725 }
3726
3727
3728 if (currentLen == INT32_MAX) {
3729 // The maximum length is unbounded.
3730 // Stop further processing of the pattern.
3731 break;
3732 }
3733
3734 }
3735 return currentLen;
3736
3737 }
3738
3739
3740 //------------------------------------------------------------------------------
3741 //
3742 // stripNOPs Remove any NOP operations from the compiled pattern code.
3743 // Extra NOPs are inserted for some constructs during the initial
3744 // code generation to provide locations that may be patched later.
3745 // Many end up unneeded, and are removed by this function.
3746 //
3747 // In order to minimize the number of passes through the pattern,
3748 // back-reference fixup is also performed here (adjusting
3749 // back-reference operands to point to the correct frame offsets).
3750 //
3751 //------------------------------------------------------------------------------
stripNOPs()3752 void RegexCompile::stripNOPs() {
3753
3754 if (U_FAILURE(*fStatus)) {
3755 return;
3756 }
3757
3758 int32_t end = fRXPat->fCompiledPat->size();
3759 UVector32 deltas(end, *fStatus);
3760
3761 // Make a first pass over the code, computing the amount that things
3762 // will be offset at each location in the original code.
3763 int32_t loc;
3764 int32_t d = 0;
3765 for (loc=0; loc<end; loc++) {
3766 deltas.addElement(d, *fStatus);
3767 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3768 if (URX_TYPE(op) == URX_NOP) {
3769 d++;
3770 }
3771 }
3772
3773 UnicodeString caseStringBuffer;
3774
3775 // Make a second pass over the code, removing the NOPs by moving following
3776 // code up, and patching operands that refer to code locations that
3777 // are being moved. The array of offsets from the first step is used
3778 // to compute the new operand values.
3779 int32_t src;
3780 int32_t dst = 0;
3781 for (src=0; src<end; src++) {
3782 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3783 int32_t opType = URX_TYPE(op);
3784 switch (opType) {
3785 case URX_NOP:
3786 break;
3787
3788 case URX_STATE_SAVE:
3789 case URX_JMP:
3790 case URX_CTR_LOOP:
3791 case URX_CTR_LOOP_NG:
3792 case URX_RELOC_OPRND:
3793 case URX_JMPX:
3794 case URX_JMP_SAV:
3795 case URX_JMP_SAV_X:
3796 // These are instructions with operands that refer to code locations.
3797 {
3798 int32_t operandAddress = URX_VAL(op);
3799 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3800 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3801 op = buildOp(opType, fixedOperandAddress);
3802 fRXPat->fCompiledPat->setElementAt(op, dst);
3803 dst++;
3804 break;
3805 }
3806
3807 case URX_BACKREF:
3808 case URX_BACKREF_I:
3809 {
3810 int32_t where = URX_VAL(op);
3811 if (where > fRXPat->fGroupMap->size()) {
3812 error(U_REGEX_INVALID_BACK_REF);
3813 break;
3814 }
3815 where = fRXPat->fGroupMap->elementAti(where-1);
3816 op = buildOp(opType, where);
3817 fRXPat->fCompiledPat->setElementAt(op, dst);
3818 dst++;
3819
3820 fRXPat->fNeedsAltInput = TRUE;
3821 break;
3822 }
3823 case URX_RESERVED_OP:
3824 case URX_RESERVED_OP_N:
3825 case URX_BACKTRACK:
3826 case URX_END:
3827 case URX_ONECHAR:
3828 case URX_STRING:
3829 case URX_STRING_LEN:
3830 case URX_START_CAPTURE:
3831 case URX_END_CAPTURE:
3832 case URX_STATIC_SETREF:
3833 case URX_STAT_SETREF_N:
3834 case URX_SETREF:
3835 case URX_DOTANY:
3836 case URX_FAIL:
3837 case URX_BACKSLASH_B:
3838 case URX_BACKSLASH_BU:
3839 case URX_BACKSLASH_G:
3840 case URX_BACKSLASH_X:
3841 case URX_BACKSLASH_Z:
3842 case URX_DOTANY_ALL:
3843 case URX_BACKSLASH_D:
3844 case URX_CARET:
3845 case URX_DOLLAR:
3846 case URX_CTR_INIT:
3847 case URX_CTR_INIT_NG:
3848 case URX_DOTANY_UNIX:
3849 case URX_STO_SP:
3850 case URX_LD_SP:
3851 case URX_STO_INP_LOC:
3852 case URX_LA_START:
3853 case URX_LA_END:
3854 case URX_ONECHAR_I:
3855 case URX_STRING_I:
3856 case URX_DOLLAR_M:
3857 case URX_CARET_M:
3858 case URX_CARET_M_UNIX:
3859 case URX_LB_START:
3860 case URX_LB_CONT:
3861 case URX_LB_END:
3862 case URX_LBN_CONT:
3863 case URX_LBN_END:
3864 case URX_LOOP_SR_I:
3865 case URX_LOOP_DOT_I:
3866 case URX_LOOP_C:
3867 case URX_DOLLAR_D:
3868 case URX_DOLLAR_MD:
3869 case URX_BACKSLASH_H:
3870 case URX_BACKSLASH_R:
3871 case URX_BACKSLASH_V:
3872 // These instructions are unaltered by the relocation.
3873 fRXPat->fCompiledPat->setElementAt(op, dst);
3874 dst++;
3875 break;
3876
3877 default:
3878 // Some op is unaccounted for.
3879 U_ASSERT(FALSE);
3880 error(U_REGEX_INTERNAL_ERROR);
3881 }
3882 }
3883
3884 fRXPat->fCompiledPat->setSize(dst);
3885 }
3886
3887
3888
3889
3890 //------------------------------------------------------------------------------
3891 //
3892 // Error Report a rule parse error.
3893 // Only report it if no previous error has been recorded.
3894 //
3895 //------------------------------------------------------------------------------
error(UErrorCode e)3896 void RegexCompile::error(UErrorCode e) {
3897 if (U_SUCCESS(*fStatus)) {
3898 *fStatus = e;
3899 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3900 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3901 // int64_t. If the values of the latter are out of range for the former,
3902 // set them to the appropriate "field not supported" values.
3903 if (fLineNum > 0x7FFFFFFF) {
3904 fParseErr->line = 0;
3905 fParseErr->offset = -1;
3906 } else if (fCharNum > 0x7FFFFFFF) {
3907 fParseErr->line = (int32_t)fLineNum;
3908 fParseErr->offset = -1;
3909 } else {
3910 fParseErr->line = (int32_t)fLineNum;
3911 fParseErr->offset = (int32_t)fCharNum;
3912 }
3913
3914 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3915
3916 // Fill in the context.
3917 // Note: extractBetween() pins supplied indicies to the string bounds.
3918 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3919 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3920 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3921 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3922 }
3923 }
3924
3925
3926 //
3927 // Assorted Unicode character constants.
3928 // Numeric because there is no portable way to enter them as literals.
3929 // (Think EBCDIC).
3930 //
3931 static const UChar chCR = 0x0d; // New lines, for terminating comments.
3932 static const UChar chLF = 0x0a; // Line Feed
3933 static const UChar chPound = 0x23; // '#', introduces a comment.
3934 static const UChar chDigit0 = 0x30; // '0'
3935 static const UChar chDigit7 = 0x37; // '9'
3936 static const UChar chColon = 0x3A; // ':'
3937 static const UChar chE = 0x45; // 'E'
3938 static const UChar chQ = 0x51; // 'Q'
3939 //static const UChar chN = 0x4E; // 'N'
3940 static const UChar chP = 0x50; // 'P'
3941 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3942 //static const UChar chLBracket = 0x5b; // '['
3943 static const UChar chRBracket = 0x5d; // ']'
3944 static const UChar chUp = 0x5e; // '^'
3945 static const UChar chLowerP = 0x70;
3946 static const UChar chLBrace = 0x7b; // '{'
3947 static const UChar chRBrace = 0x7d; // '}'
3948 static const UChar chNEL = 0x85; // NEL newline variant
3949 static const UChar chLS = 0x2028; // Unicode Line Separator
3950
3951
3952 //------------------------------------------------------------------------------
3953 //
3954 // nextCharLL Low Level Next Char from the regex pattern.
3955 // Get a char from the string, keep track of input position
3956 // for error reporting.
3957 //
3958 //------------------------------------------------------------------------------
nextCharLL()3959 UChar32 RegexCompile::nextCharLL() {
3960 UChar32 ch;
3961
3962 if (fPeekChar != -1) {
3963 ch = fPeekChar;
3964 fPeekChar = -1;
3965 return ch;
3966 }
3967
3968 // assume we're already in the right place
3969 ch = UTEXT_NEXT32(fRXPat->fPattern);
3970 if (ch == U_SENTINEL) {
3971 return ch;
3972 }
3973
3974 if (ch == chCR ||
3975 ch == chNEL ||
3976 ch == chLS ||
3977 (ch == chLF && fLastChar != chCR)) {
3978 // Character is starting a new line. Bump up the line number, and
3979 // reset the column to 0.
3980 fLineNum++;
3981 fCharNum=0;
3982 }
3983 else {
3984 // Character is not starting a new line. Except in the case of a
3985 // LF following a CR, increment the column position.
3986 if (ch != chLF) {
3987 fCharNum++;
3988 }
3989 }
3990 fLastChar = ch;
3991 return ch;
3992 }
3993
3994 //------------------------------------------------------------------------------
3995 //
3996 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3997 // character without actually getting it.
3998 //
3999 //------------------------------------------------------------------------------
peekCharLL()4000 UChar32 RegexCompile::peekCharLL() {
4001 if (fPeekChar == -1) {
4002 fPeekChar = nextCharLL();
4003 }
4004 return fPeekChar;
4005 }
4006
4007
4008 //------------------------------------------------------------------------------
4009 //
4010 // nextChar for pattern scanning. At this level, we handle stripping
4011 // out comments and processing some backslash character escapes.
4012 // The rest of the pattern grammar is handled at the next level up.
4013 //
4014 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4015 void RegexCompile::nextChar(RegexPatternChar &c) {
4016
4017 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4018 c.fChar = nextCharLL();
4019 c.fQuoted = FALSE;
4020
4021 if (fQuoteMode) {
4022 c.fQuoted = TRUE;
4023 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4024 c.fChar == (UChar32)-1) {
4025 fQuoteMode = FALSE; // Exit quote mode,
4026 nextCharLL(); // discard the E
4027 nextChar(c); // recurse to get the real next char
4028 }
4029 }
4030 else if (fInBackslashQuote) {
4031 // The current character immediately follows a '\'
4032 // Don't check for any further escapes, just return it as-is.
4033 // Don't set c.fQuoted, because that would prevent the state machine from
4034 // dispatching on the character.
4035 fInBackslashQuote = FALSE;
4036 }
4037 else
4038 {
4039 // We are not in a \Q quoted region \E of the source.
4040 //
4041 if (fModeFlags & UREGEX_COMMENTS) {
4042 //
4043 // We are in free-spacing and comments mode.
4044 // Scan through any white space and comments, until we
4045 // reach a significant character or the end of inut.
4046 for (;;) {
4047 if (c.fChar == (UChar32)-1) {
4048 break; // End of Input
4049 }
4050 if (c.fChar == chPound && fEOLComments == TRUE) {
4051 // Start of a comment. Consume the rest of it, until EOF or a new line
4052 for (;;) {
4053 c.fChar = nextCharLL();
4054 if (c.fChar == (UChar32)-1 || // EOF
4055 c.fChar == chCR ||
4056 c.fChar == chLF ||
4057 c.fChar == chNEL ||
4058 c.fChar == chLS) {
4059 break;
4060 }
4061 }
4062 }
4063 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4064 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4065 break;
4066 }
4067 c.fChar = nextCharLL();
4068 }
4069 }
4070
4071 //
4072 // check for backslash escaped characters.
4073 //
4074 if (c.fChar == chBackSlash) {
4075 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4076 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4077 //
4078 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4079 // Includes \uxxxx, \n, \r, many others.
4080 // Return the single equivalent character.
4081 //
4082 nextCharLL(); // get & discard the peeked char.
4083 c.fQuoted = TRUE;
4084
4085 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4086 int32_t endIndex = (int32_t)pos;
4087 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4088
4089 if (endIndex == pos) {
4090 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4091 }
4092 fCharNum += endIndex - pos;
4093 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4094 } else {
4095 int32_t offset = 0;
4096 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4097
4098 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4099 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4100
4101 if (offset == 0) {
4102 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4103 } else if (context.lastOffset == offset) {
4104 UTEXT_PREVIOUS32(fRXPat->fPattern);
4105 } else if (context.lastOffset != offset-1) {
4106 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4107 }
4108 fCharNum += offset;
4109 }
4110 }
4111 else if (peekCharLL() == chDigit0) {
4112 // Octal Escape, using Java Regexp Conventions
4113 // which are \0 followed by 1-3 octal digits.
4114 // Different from ICU Unescape handling of Octal, which does not
4115 // require the leading 0.
4116 // Java also has the convention of only consuming 2 octal digits if
4117 // the three digit number would be > 0xff
4118 //
4119 c.fChar = 0;
4120 nextCharLL(); // Consume the initial 0.
4121 int index;
4122 for (index=0; index<3; index++) {
4123 int32_t ch = peekCharLL();
4124 if (ch<chDigit0 || ch>chDigit7) {
4125 if (index==0) {
4126 // \0 is not followed by any octal digits.
4127 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4128 }
4129 break;
4130 }
4131 c.fChar <<= 3;
4132 c.fChar += ch&7;
4133 if (c.fChar <= 255) {
4134 nextCharLL();
4135 } else {
4136 // The last digit made the number too big. Forget we saw it.
4137 c.fChar >>= 3;
4138 }
4139 }
4140 c.fQuoted = TRUE;
4141 }
4142 else if (peekCharLL() == chQ) {
4143 // "\Q" enter quote mode, which will continue until "\E"
4144 fQuoteMode = TRUE;
4145 nextCharLL(); // discard the 'Q'.
4146 nextChar(c); // recurse to get the real next char.
4147 }
4148 else
4149 {
4150 // We are in a '\' escape that will be handled by the state table scanner.
4151 // Just return the backslash, but remember that the following char is to
4152 // be taken literally.
4153 fInBackslashQuote = TRUE;
4154 }
4155 }
4156 }
4157
4158 // re-enable # to end-of-line comments, in case they were disabled.
4159 // They are disabled by the parser upon seeing '(?', but this lasts for
4160 // the fetching of the next character only.
4161 fEOLComments = TRUE;
4162
4163 // putc(c.fChar, stdout);
4164 }
4165
4166
4167
4168 //------------------------------------------------------------------------------
4169 //
4170 // scanNamedChar
4171 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4172 //
4173 // The scan position will be at the 'N'. On return
4174 // the scan position should be just after the '}'
4175 //
4176 // Return the UChar32
4177 //
4178 //------------------------------------------------------------------------------
scanNamedChar()4179 UChar32 RegexCompile::scanNamedChar() {
4180 if (U_FAILURE(*fStatus)) {
4181 return 0;
4182 }
4183
4184 nextChar(fC);
4185 if (fC.fChar != chLBrace) {
4186 error(U_REGEX_PROPERTY_SYNTAX);
4187 return 0;
4188 }
4189
4190 UnicodeString charName;
4191 for (;;) {
4192 nextChar(fC);
4193 if (fC.fChar == chRBrace) {
4194 break;
4195 }
4196 if (fC.fChar == -1) {
4197 error(U_REGEX_PROPERTY_SYNTAX);
4198 return 0;
4199 }
4200 charName.append(fC.fChar);
4201 }
4202
4203 char name[100];
4204 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4205 (uint32_t)charName.length()>=sizeof(name)) {
4206 // All Unicode character names have only invariant characters.
4207 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4208 // which requires this error check
4209 error(U_REGEX_PROPERTY_SYNTAX);
4210 return 0;
4211 }
4212 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4213
4214 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4215 if (U_FAILURE(*fStatus)) {
4216 error(U_REGEX_PROPERTY_SYNTAX);
4217 }
4218
4219 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4220 return theChar;
4221 }
4222
4223 //------------------------------------------------------------------------------
4224 //
4225 // scanProp Construct a UnicodeSet from the text at the current scan
4226 // position, which will be of the form \p{whaterver}
4227 //
4228 // The scan position will be at the 'p' or 'P'. On return
4229 // the scan position should be just after the '}'
4230 //
4231 // Return a UnicodeSet, constructed from the \P pattern,
4232 // or NULL if the pattern is invalid.
4233 //
4234 //------------------------------------------------------------------------------
scanProp()4235 UnicodeSet *RegexCompile::scanProp() {
4236 UnicodeSet *uset = NULL;
4237
4238 if (U_FAILURE(*fStatus)) {
4239 return NULL;
4240 }
4241 (void)chLowerP; // Suppress compiler unused variable warning.
4242 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4243 UBool negated = (fC.fChar == chP);
4244
4245 UnicodeString propertyName;
4246 nextChar(fC);
4247 if (fC.fChar != chLBrace) {
4248 error(U_REGEX_PROPERTY_SYNTAX);
4249 return NULL;
4250 }
4251 for (;;) {
4252 nextChar(fC);
4253 if (fC.fChar == chRBrace) {
4254 break;
4255 }
4256 if (fC.fChar == -1) {
4257 // Hit the end of the input string without finding the closing '}'
4258 error(U_REGEX_PROPERTY_SYNTAX);
4259 return NULL;
4260 }
4261 propertyName.append(fC.fChar);
4262 }
4263 uset = createSetForProperty(propertyName, negated);
4264 nextChar(fC); // Move input scan to position following the closing '}'
4265 return uset;
4266 }
4267
4268 //------------------------------------------------------------------------------
4269 //
4270 // scanPosixProp Construct a UnicodeSet from the text at the current scan
4271 // position, which is expected be of the form [:property expression:]
4272 //
4273 // The scan position will be at the opening ':'. On return
4274 // the scan position must be on the closing ']'
4275 //
4276 // Return a UnicodeSet constructed from the pattern,
4277 // or NULL if this is not a valid POSIX-style set expression.
4278 // If not a property expression, restore the initial scan position
4279 // (to the opening ':')
4280 //
4281 // Note: the opening '[:' is not sufficient to guarantee that
4282 // this is a [:property:] expression.
4283 // [:'+=,] is a perfectly good ordinary set expression that
4284 // happens to include ':' as one of its characters.
4285 //
4286 //------------------------------------------------------------------------------
scanPosixProp()4287 UnicodeSet *RegexCompile::scanPosixProp() {
4288 UnicodeSet *uset = NULL;
4289
4290 if (U_FAILURE(*fStatus)) {
4291 return NULL;
4292 }
4293
4294 U_ASSERT(fC.fChar == chColon);
4295
4296 // Save the scanner state.
4297 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4298 int64_t savedScanIndex = fScanIndex;
4299 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4300 UBool savedQuoteMode = fQuoteMode;
4301 UBool savedInBackslashQuote = fInBackslashQuote;
4302 UBool savedEOLComments = fEOLComments;
4303 int64_t savedLineNum = fLineNum;
4304 int64_t savedCharNum = fCharNum;
4305 UChar32 savedLastChar = fLastChar;
4306 UChar32 savedPeekChar = fPeekChar;
4307 RegexPatternChar savedfC = fC;
4308
4309 // Scan for a closing ]. A little tricky because there are some perverse
4310 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4311 // ending on the second closing ].
4312
4313 UnicodeString propName;
4314 UBool negated = FALSE;
4315
4316 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4317 nextChar(fC);
4318 if (fC.fChar == chUp) {
4319 negated = TRUE;
4320 nextChar(fC);
4321 }
4322
4323 // Scan for the closing ":]", collecting the property name along the way.
4324 UBool sawPropSetTerminator = FALSE;
4325 for (;;) {
4326 propName.append(fC.fChar);
4327 nextChar(fC);
4328 if (fC.fQuoted || fC.fChar == -1) {
4329 // Escaped characters or end of input - either says this isn't a [:Property:]
4330 break;
4331 }
4332 if (fC.fChar == chColon) {
4333 nextChar(fC);
4334 if (fC.fChar == chRBracket) {
4335 sawPropSetTerminator = TRUE;
4336 }
4337 break;
4338 }
4339 }
4340
4341 if (sawPropSetTerminator) {
4342 uset = createSetForProperty(propName, negated);
4343 }
4344 else
4345 {
4346 // No closing ":]".
4347 // Restore the original scan position.
4348 // The main scanner will retry the input as a normal set expression,
4349 // not a [:Property:] expression.
4350 fScanIndex = savedScanIndex;
4351 fQuoteMode = savedQuoteMode;
4352 fInBackslashQuote = savedInBackslashQuote;
4353 fEOLComments = savedEOLComments;
4354 fLineNum = savedLineNum;
4355 fCharNum = savedCharNum;
4356 fLastChar = savedLastChar;
4357 fPeekChar = savedPeekChar;
4358 fC = savedfC;
4359 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4360 }
4361 return uset;
4362 }
4363
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4364 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4365 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4366 addCategory(set, U_GC_CF_MASK, ec);
4367 }
4368
4369 //
4370 // Create a Unicode Set from a Unicode Property expression.
4371 // This is common code underlying both \p{...} ane [:...:] expressions.
4372 // Includes trying the Java "properties" that aren't supported as
4373 // normal ICU UnicodeSet properties
4374 //
4375 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4376 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
createSetForProperty(const UnicodeString & propName,UBool negated)4377 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4378 UnicodeString setExpr;
4379 UnicodeSet *set;
4380 uint32_t usetFlags = 0;
4381
4382 if (U_FAILURE(*fStatus)) {
4383 return NULL;
4384 }
4385
4386 //
4387 // First try the property as we received it
4388 //
4389 if (negated) {
4390 setExpr.append(negSetPrefix, -1);
4391 } else {
4392 setExpr.append(posSetPrefix, -1);
4393 }
4394 setExpr.append(propName);
4395 setExpr.append(chRBrace);
4396 setExpr.append(chRBracket);
4397 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4398 usetFlags |= USET_CASE_INSENSITIVE;
4399 }
4400 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4401 if (U_SUCCESS(*fStatus)) {
4402 return set;
4403 }
4404 delete set;
4405 set = NULL;
4406
4407 //
4408 // The property as it was didn't work.
4409
4410 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
4411 // or standard Java, but many other regular expression packages do recognize it.
4412
4413 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4414 *fStatus = U_ZERO_ERROR;
4415 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4416 if (set == NULL) {
4417 *fStatus = U_MEMORY_ALLOCATION_ERROR;
4418 return set;
4419 }
4420 if (negated) {
4421 set->complement();
4422 }
4423 return set;
4424 }
4425
4426
4427 // Do Java fixes -
4428 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4429 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4430 //
4431 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4432 // is accepted by Java. The property part of the name is compared
4433 // case-insenstively. The spaces must be exactly as shown, either
4434 // all there, or all omitted, with exactly one at each position
4435 // if they are present. From checking against JDK 1.6
4436 //
4437 // This code should be removed when ICU properties support the Java compatibility names
4438 // (ICU 4.0?)
4439 //
4440 UnicodeString mPropName = propName;
4441 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4442 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4443 }
4444 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4445 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4446 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4447 }
4448 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4449 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4450 }
4451
4452 // See if the property looks like a Java "InBlockName", which
4453 // we will recast as "Block=BlockName"
4454 //
4455 static const UChar IN[] = {0x49, 0x6E, 0}; // "In"
4456 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block="
4457 if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4458 setExpr.truncate(4); // Leaves "[\p{", or "[\P{"
4459 setExpr.append(BLOCK, -1);
4460 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed.
4461 setExpr.append(chRBrace);
4462 setExpr.append(chRBracket);
4463 *fStatus = U_ZERO_ERROR;
4464 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4465 if (U_SUCCESS(*fStatus)) {
4466 return set;
4467 }
4468 delete set;
4469 set = NULL;
4470 }
4471
4472 if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4473 propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4474 {
4475 UErrorCode localStatus = U_ZERO_ERROR;
4476 //setExpr.remove();
4477 set = new UnicodeSet();
4478 //
4479 // Try the various Java specific properties.
4480 // These all begin with "java"
4481 //
4482 if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4483 addCategory(set, U_GC_CN_MASK, localStatus);
4484 set->complement();
4485 }
4486 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4487 addCategory(set, U_GC_ND_MASK, localStatus);
4488 }
4489 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4490 addIdentifierIgnorable(set, localStatus);
4491 }
4492 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4493 set->add(0, 0x1F).add(0x7F, 0x9F);
4494 }
4495 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4496 addCategory(set, U_GC_L_MASK, localStatus);
4497 addCategory(set, U_GC_SC_MASK, localStatus);
4498 addCategory(set, U_GC_PC_MASK, localStatus);
4499 addCategory(set, U_GC_ND_MASK, localStatus);
4500 addCategory(set, U_GC_NL_MASK, localStatus);
4501 addCategory(set, U_GC_MC_MASK, localStatus);
4502 addCategory(set, U_GC_MN_MASK, localStatus);
4503 addIdentifierIgnorable(set, localStatus);
4504 }
4505 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4506 addCategory(set, U_GC_L_MASK, localStatus);
4507 addCategory(set, U_GC_NL_MASK, localStatus);
4508 addCategory(set, U_GC_SC_MASK, localStatus);
4509 addCategory(set, U_GC_PC_MASK, localStatus);
4510 }
4511 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4512 addCategory(set, U_GC_L_MASK, localStatus);
4513 }
4514 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4515 addCategory(set, U_GC_L_MASK, localStatus);
4516 addCategory(set, U_GC_ND_MASK, localStatus);
4517 }
4518 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4519 addCategory(set, U_GC_LL_MASK, localStatus);
4520 }
4521 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4522 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4523 }
4524 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4525 addCategory(set, U_GC_Z_MASK, localStatus);
4526 }
4527 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4528 set->add(0x10000, UnicodeSet::MAX_VALUE);
4529 }
4530 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4531 addCategory(set, U_GC_LT_MASK, localStatus);
4532 }
4533 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4534 addCategory(set, U_GC_L_MASK, localStatus);
4535 addCategory(set, U_GC_NL_MASK, localStatus);
4536 }
4537 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4538 addCategory(set, U_GC_L_MASK, localStatus);
4539 addCategory(set, U_GC_PC_MASK, localStatus);
4540 addCategory(set, U_GC_ND_MASK, localStatus);
4541 addCategory(set, U_GC_NL_MASK, localStatus);
4542 addCategory(set, U_GC_MC_MASK, localStatus);
4543 addCategory(set, U_GC_MN_MASK, localStatus);
4544 addIdentifierIgnorable(set, localStatus);
4545 }
4546 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4547 addCategory(set, U_GC_LU_MASK, localStatus);
4548 }
4549 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4550 set->add(0, UnicodeSet::MAX_VALUE);
4551 }
4552 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4553 addCategory(set, U_GC_Z_MASK, localStatus);
4554 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4555 set->add(9, 0x0d).add(0x1c, 0x1f);
4556 }
4557 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4558 set->add(0, UnicodeSet::MAX_VALUE);
4559 }
4560
4561 if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4562 *fStatus = U_ZERO_ERROR;
4563 if (usetFlags & USET_CASE_INSENSITIVE) {
4564 set->closeOver(USET_CASE_INSENSITIVE);
4565 }
4566 if (negated) {
4567 set->complement();
4568 }
4569 return set;
4570 }
4571 delete set;
4572 set = NULL;
4573 }
4574 error(*fStatus);
4575 return NULL;
4576 }
4577
4578
4579
4580 //
4581 // SetEval Part of the evaluation of [set expressions].
4582 // Perform any pending (stacked) operations with precedence
4583 // equal or greater to that of the next operator encountered
4584 // in the expression.
4585 //
setEval(int32_t nextOp)4586 void RegexCompile::setEval(int32_t nextOp) {
4587 UnicodeSet *rightOperand = NULL;
4588 UnicodeSet *leftOperand = NULL;
4589 for (;;) {
4590 U_ASSERT(fSetOpStack.empty()==FALSE);
4591 int32_t pendingSetOperation = fSetOpStack.peeki();
4592 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4593 break;
4594 }
4595 fSetOpStack.popi();
4596 U_ASSERT(fSetStack.empty() == FALSE);
4597 rightOperand = (UnicodeSet *)fSetStack.peek();
4598 switch (pendingSetOperation) {
4599 case setNegation:
4600 rightOperand->complement();
4601 break;
4602 case setCaseClose:
4603 // TODO: need a simple close function. Ticket 6065
4604 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4605 rightOperand->removeAllStrings();
4606 break;
4607 case setDifference1:
4608 case setDifference2:
4609 fSetStack.pop();
4610 leftOperand = (UnicodeSet *)fSetStack.peek();
4611 leftOperand->removeAll(*rightOperand);
4612 delete rightOperand;
4613 break;
4614 case setIntersection1:
4615 case setIntersection2:
4616 fSetStack.pop();
4617 leftOperand = (UnicodeSet *)fSetStack.peek();
4618 leftOperand->retainAll(*rightOperand);
4619 delete rightOperand;
4620 break;
4621 case setUnion:
4622 fSetStack.pop();
4623 leftOperand = (UnicodeSet *)fSetStack.peek();
4624 leftOperand->addAll(*rightOperand);
4625 delete rightOperand;
4626 break;
4627 default:
4628 U_ASSERT(FALSE);
4629 break;
4630 }
4631 }
4632 }
4633
setPushOp(int32_t op)4634 void RegexCompile::setPushOp(int32_t op) {
4635 setEval(op);
4636 fSetOpStack.push(op, *fStatus);
4637 fSetStack.push(new UnicodeSet(), *fStatus);
4638 }
4639
4640 U_NAMESPACE_END
4641 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4642
4643