• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  regexcmp.cpp
5 //
6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the ICU regular expression compiler, which is responsible
10 //  for processing a regular expression pattern into the compiled form that
11 //  is used by the match finding engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstring.h"
32 #include "uvectr32.h"
33 #include "uvectr64.h"
34 #include "uassert.h"
35 #include "uinvchar.h"
36 
37 #include "regeximp.h"
38 #include "regexcst.h"   // Contains state table for the regex pattern parser.
39                         //   generated by a Perl script.
40 #include "regexcmp.h"
41 #include "regexst.h"
42 #include "regextxt.h"
43 
44 
45 
46 U_NAMESPACE_BEGIN
47 
48 
49 //------------------------------------------------------------------------------
50 //
51 //  Constructor.
52 //
53 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)54 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
55    fParenStack(status), fSetStack(status), fSetOpStack(status)
56 {
57     // Lazy init of all shared global sets (needed for init()'s empty text)
58     RegexStaticSets::initGlobals(&status);
59 
60     fStatus           = &status;
61 
62     fRXPat            = rxp;
63     fScanIndex        = 0;
64     fLastChar         = -1;
65     fPeekChar         = -1;
66     fLineNum          = 1;
67     fCharNum          = 0;
68     fQuoteMode        = FALSE;
69     fInBackslashQuote = FALSE;
70     fModeFlags        = fRXPat->fFlags | 0x80000000;
71     fEOLComments      = TRUE;
72 
73     fMatchOpenParen   = -1;
74     fMatchCloseParen  = -1;
75     fCaptureName      = NULL;
76     fLastSetLiteral   = U_SENTINEL;
77 
78     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
79         status = rxp->fDeferredStatus;
80     }
81 }
82 
83 static const UChar      chAmp       = 0x26;      // '&'
84 static const UChar      chDash      = 0x2d;      // '-'
85 
86 
87 //------------------------------------------------------------------------------
88 //
89 //  Destructor
90 //
91 //------------------------------------------------------------------------------
~RegexCompile()92 RegexCompile::~RegexCompile() {
93     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
94                                  //   compilation stops with a syntax error.
95 }
96 
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)97 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
98     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
99 }
100 
101 //------------------------------------------------------------------------------
102 //
103 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
104 //                           The state tables are hand-written in the file regexcst.txt,
105 //                           and converted to the form used here by a perl
106 //                           script regexcst.pl
107 //
108 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)109 void    RegexCompile::compile(
110                          const UnicodeString &pat,   // Source pat to be compiled.
111                          UParseError &pp,            // Error position info
112                          UErrorCode &e)              // Error Code
113 {
114     fRXPat->fPatternString = new UnicodeString(pat);
115     UText patternText = UTEXT_INITIALIZER;
116     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
117 
118     if (U_SUCCESS(e)) {
119         compile(&patternText, pp, e);
120         utext_close(&patternText);
121     }
122 }
123 
124 //
125 //   compile, UText mode
126 //     All the work is actually done here.
127 //
compile(UText * pat,UParseError & pp,UErrorCode & e)128 void    RegexCompile::compile(
129                          UText *pat,                 // Source pat to be compiled.
130                          UParseError &pp,            // Error position info
131                          UErrorCode &e)              // Error Code
132 {
133     fStatus             = &e;
134     fParseErr           = &pp;
135     fStackPtr           = 0;
136     fStack[fStackPtr]   = 0;
137 
138     if (U_FAILURE(*fStatus)) {
139         return;
140     }
141 
142     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
143     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
144 
145     // Prepare the RegexPattern object to receive the compiled pattern.
146     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
147     if (U_FAILURE(*fStatus)) {
148         return;
149     }
150     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
151     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
152 
153 
154     // Initialize the pattern scanning state machine
155     fPatternLength = utext_nativeLength(pat);
156     uint16_t                state = 1;
157     const RegexTableEl      *tableEl;
158 
159     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
160     if (fModeFlags & UREGEX_LITERAL) {
161         fQuoteMode = TRUE;
162     }
163 
164     nextChar(fC);                        // Fetch the first char from the pattern string.
165 
166     //
167     // Main loop for the regex pattern parsing state machine.
168     //   Runs once per state transition.
169     //   Each time through optionally performs, depending on the state table,
170     //      - an advance to the the next pattern char
171     //      - an action to be performed.
172     //      - pushing or popping a state to/from the local state return stack.
173     //   file regexcst.txt is the source for the state table.  The logic behind
174     //     recongizing the pattern syntax is there, not here.
175     //
176     for (;;) {
177         //  Bail out if anything has gone wrong.
178         //  Regex pattern parsing stops on the first error encountered.
179         if (U_FAILURE(*fStatus)) {
180             break;
181         }
182 
183         U_ASSERT(state != 0);
184 
185         // Find the state table element that matches the input char from the pattern, or the
186         //    class of the input character.  Start with the first table row for this
187         //    state, then linearly scan forward until we find a row that matches the
188         //    character.  The last row for each state always matches all characters, so
189         //    the search will stop there, if not before.
190         //
191         tableEl = &gRuleParseStateTable[state];
192         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
193             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
194 
195         for (;;) {    // loop through table rows belonging to this state, looking for one
196                       //   that matches the current input char.
197             REGEX_SCAN_DEBUG_PRINTF(("."));
198             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
199                 // Table row specified an individual character, not a set, and
200                 //   the input character is not quoted, and
201                 //   the input character matched it.
202                 break;
203             }
204             if (tableEl->fCharClass == 255) {
205                 // Table row specified default, match anything character class.
206                 break;
207             }
208             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
209                 // Table row specified "quoted" and the char was quoted.
210                 break;
211             }
212             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
213                 // Table row specified eof and we hit eof on the input.
214                 break;
215             }
216 
217             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
218                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
219                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
220                 U_ASSERT(tableEl->fCharClass <= 137);
221                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
222                     // Table row specified a character class, or set of characters,
223                     //   and the current char matches it.
224                     break;
225                 }
226             }
227 
228             // No match on this row, advance to the next  row for this state,
229             tableEl++;
230         }
231         REGEX_SCAN_DEBUG_PRINTF(("\n"));
232 
233         //
234         // We've found the row of the state table that matches the current input
235         //   character from the rules string.
236         // Perform any action specified  by this row in the state table.
237         if (doParseActions(tableEl->fAction) == FALSE) {
238             // Break out of the state machine loop if the
239             //   the action signalled some kind of error, or
240             //   the action was to exit, occurs on normal end-of-rules-input.
241             break;
242         }
243 
244         if (tableEl->fPushState != 0) {
245             fStackPtr++;
246             if (fStackPtr >= kStackSize) {
247                 error(U_REGEX_INTERNAL_ERROR);
248                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
249                 fStackPtr--;
250             }
251             fStack[fStackPtr] = tableEl->fPushState;
252         }
253 
254         //
255         //  NextChar.  This is where characters are actually fetched from the pattern.
256         //             Happens under control of the 'n' tag in the state table.
257         //
258         if (tableEl->fNextChar) {
259             nextChar(fC);
260         }
261 
262         // Get the next state from the table entry, or from the
263         //   state stack if the next state was specified as "pop".
264         if (tableEl->fNextState != 255) {
265             state = tableEl->fNextState;
266         } else {
267             state = fStack[fStackPtr];
268             fStackPtr--;
269             if (fStackPtr < 0) {
270                 // state stack underflow
271                 // This will occur if the user pattern has mis-matched parentheses,
272                 //   with extra close parens.
273                 //
274                 fStackPtr++;
275                 error(U_REGEX_MISMATCHED_PAREN);
276             }
277         }
278 
279     }
280 
281     if (U_FAILURE(*fStatus)) {
282         // Bail out if the pattern had errors.
283         //   Set stack cleanup:  a successful compile would have left it empty,
284         //   but errors can leave temporary sets hanging around.
285         while (!fSetStack.empty()) {
286             delete (UnicodeSet *)fSetStack.pop();
287         }
288         return;
289     }
290 
291     //
292     // The pattern has now been read and processed, and the compiled code generated.
293     //
294 
295     //
296     // The pattern's fFrameSize so far has accumulated the requirements for
297     //   storage for capture parentheses, counters, etc. that are encountered
298     //   in the pattern.  Add space for the two variables that are always
299     //   present in the saved state:  the input string position (int64_t) and
300     //   the position in the compiled pattern.
301     //
302     allocateStackData(RESTACKFRAME_HDRCOUNT);
303 
304     //
305     // Optimization pass 1: NOPs, back-references, and case-folding
306     //
307     stripNOPs();
308 
309     //
310     // Get bounds for the minimum and maximum length of a string that this
311     //   pattern can match.  Used to avoid looking for matches in strings that
312     //   are too short.
313     //
314     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
315 
316     //
317     // Optimization pass 2: match start type
318     //
319     matchStartType();
320 
321     //
322     // Set up fast latin-1 range sets
323     //
324     int32_t numSets = fRXPat->fSets->size();
325     fRXPat->fSets8 = new Regex8BitSet[numSets];
326     // Null pointer check.
327     if (fRXPat->fSets8 == NULL) {
328         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
329         return;
330     }
331     int32_t i;
332     for (i=0; i<numSets; i++) {
333         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
334         fRXPat->fSets8[i].init(s);
335     }
336 
337 }
338 
339 
340 
341 
342 
343 //------------------------------------------------------------------------------
344 //
345 //  doParseAction        Do some action during regex pattern parsing.
346 //                       Called by the parse state machine.
347 //
348 //                       Generation of the match engine PCode happens here, or
349 //                       in functions called from the parse actions defined here.
350 //
351 //
352 //------------------------------------------------------------------------------
doParseActions(int32_t action)353 UBool RegexCompile::doParseActions(int32_t action)
354 {
355     UBool   returnVal = TRUE;
356 
357     switch ((Regex_PatternParseAction)action) {
358 
359     case doPatStart:
360         // Start of pattern compiles to:
361         //0   SAVE   2        Fall back to position of FAIL
362         //1   jmp    3
363         //2   FAIL            Stop if we ever reach here.
364         //3   NOP             Dummy, so start of pattern looks the same as
365         //                    the start of an ( grouping.
366         //4   NOP             Resreved, will be replaced by a save if there are
367         //                    OR | operators at the top level
368         appendOp(URX_STATE_SAVE, 2);
369         appendOp(URX_JMP,  3);
370         appendOp(URX_FAIL, 0);
371 
372         // Standard open nonCapture paren action emits the two NOPs and
373         //   sets up the paren stack frame.
374         doParseActions(doOpenNonCaptureParen);
375         break;
376 
377     case doPatFinish:
378         // We've scanned to the end of the pattern
379         //  The end of pattern compiles to:
380         //        URX_END
381         //    which will stop the runtime match engine.
382         //  Encountering end of pattern also behaves like a close paren,
383         //   and forces fixups of the State Save at the beginning of the compiled pattern
384         //   and of any OR operations at the top level.
385         //
386         handleCloseParen();
387         if (fParenStack.size() > 0) {
388             // Missing close paren in pattern.
389             error(U_REGEX_MISMATCHED_PAREN);
390         }
391 
392         // add the END operation to the compiled pattern.
393         appendOp(URX_END, 0);
394 
395         // Terminate the pattern compilation state machine.
396         returnVal = FALSE;
397         break;
398 
399 
400 
401     case doOrOperator:
402         // Scanning a '|', as in (A|B)
403         {
404             // Generate code for any pending literals preceding the '|'
405             fixLiterals(FALSE);
406 
407             // Insert a SAVE operation at the start of the pattern section preceding
408             //   this OR at this level.  This SAVE will branch the match forward
409             //   to the right hand side of the OR in the event that the left hand
410             //   side fails to match and backtracks.  Locate the position for the
411             //   save from the location on the top of the parentheses stack.
412             int32_t savePosition = fParenStack.popi();
413             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
414             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
415             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
416             fRXPat->fCompiledPat->setElementAt(op, savePosition);
417 
418             // Append an JMP operation into the compiled pattern.  The operand for
419             //  the JMP will eventually be the location following the ')' for the
420             //  group.  This will be patched in later, when the ')' is encountered.
421             appendOp(URX_JMP, 0);
422 
423             // Push the position of the newly added JMP op onto the parentheses stack.
424             // This registers if for fixup when this block's close paren is encountered.
425             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
426 
427             // Append a NOP to the compiled pattern.  This is the slot reserved
428             //   for a SAVE in the event that there is yet another '|' following
429             //   this one.
430             appendOp(URX_NOP, 0);
431             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
432         }
433         break;
434 
435 
436     case doBeginNamedCapture:
437         // Scanning (?<letter.
438         //   The first letter of the name will come through again under doConinueNamedCapture.
439         fCaptureName = new UnicodeString();
440         if (fCaptureName == NULL) {
441             error(U_MEMORY_ALLOCATION_ERROR);
442         }
443         break;
444 
445     case  doContinueNamedCapture:
446         fCaptureName->append(fC.fChar);
447         break;
448 
449     case doBadNamedCapture:
450         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
451         break;
452 
453     case doOpenCaptureParen:
454         // Open Capturing Paren, possibly named.
455         //   Compile to a
456         //      - NOP, which later may be replaced by a save-state if the
457         //         parenthesized group gets a * quantifier, followed by
458         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
459         //      - NOP, which may later be replaced by a save-state if there
460         //             is an '|' alternation within the parens.
461         //
462         //    Each capture group gets three slots in the save stack frame:
463         //         0: Capture Group start position (in input string being matched.)
464         //         1: Capture Group end position.
465         //         2: Start of Match-in-progress.
466         //    The first two locations are for a completed capture group, and are
467         //     referred to by back references and the like.
468         //    The third location stores the capture start position when an START_CAPTURE is
469         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
470         //      END_CAPTURE is encountered.
471         {
472             fixLiterals();
473             appendOp(URX_NOP, 0);
474             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
475             appendOp(URX_START_CAPTURE, varsLoc);
476             appendOp(URX_NOP, 0);
477 
478             // On the Parentheses stack, start a new frame and add the postions
479             //   of the two NOPs.  Depending on what follows in the pattern, the
480             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
481             //   address of the end of the parenthesized group.
482             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
483             fParenStack.push(capturing, *fStatus);                        // Frame type.
484             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
485             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
486 
487             // Save the mapping from group number to stack frame variable position.
488             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
489 
490             // If this is a named capture group, add the name->group number mapping.
491             if (fCaptureName != NULL) {
492                 int32_t groupNumber = fRXPat->fGroupMap->size();
493                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
494                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
495                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
496                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
497                 }
498             }
499         }
500         break;
501 
502     case doOpenNonCaptureParen:
503         // Open non-caputuring (grouping only) Paren.
504         //   Compile to a
505         //      - NOP, which later may be replaced by a save-state if the
506         //         parenthesized group gets a * quantifier, followed by
507         //      - NOP, which may later be replaced by a save-state if there
508         //             is an '|' alternation within the parens.
509         {
510             fixLiterals();
511             appendOp(URX_NOP, 0);
512             appendOp(URX_NOP, 0);
513 
514             // On the Parentheses stack, start a new frame and add the postions
515             //   of the two NOPs.
516             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
517             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
518             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
519             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
520         }
521          break;
522 
523 
524     case doOpenAtomicParen:
525         // Open Atomic Paren.  (?>
526         //   Compile to a
527         //      - NOP, which later may be replaced if the parenthesized group
528         //         has a quantifier, followed by
529         //      - STO_SP  save state stack position, so it can be restored at the ")"
530         //      - NOP, which may later be replaced by a save-state if there
531         //             is an '|' alternation within the parens.
532         {
533             fixLiterals();
534             appendOp(URX_NOP, 0);
535             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
536             appendOp(URX_STO_SP, varLoc);
537             appendOp(URX_NOP, 0);
538 
539             // On the Parentheses stack, start a new frame and add the postions
540             //   of the two NOPs.  Depending on what follows in the pattern, the
541             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
542             //   address of the end of the parenthesized group.
543             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
544             fParenStack.push(atomic, *fStatus);                           // Frame type.
545             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
546             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
547         }
548         break;
549 
550 
551     case doOpenLookAhead:
552         // Positive Look-ahead   (?=  stuff  )
553         //
554         //   Note:   Addition of transparent input regions, with the need to
555         //           restore the original regions when failing out of a lookahead
556         //           block, complicated this sequence.  Some conbined opcodes
557         //           might make sense - or might not, lookahead aren't that common.
558         //
559         //      Caution:  min match length optimization knows about this
560         //               sequence; don't change without making updates there too.
561         //
562         // Compiles to
563         //    1    START_LA     dataLoc     Saves SP, Input Pos
564         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
565         //    3    JMP          6           continue ...
566         //
567         //    4.   LA_END                   Look Ahead failed.  Restore regions.
568         //    5.   BACKTRACK                and back track again.
569         //
570         //    6.   NOP              reserved for use by quantifiers on the block.
571         //                          Look-ahead can't have quantifiers, but paren stack
572         //                             compile time conventions require the slot anyhow.
573         //    7.   NOP              may be replaced if there is are '|' ops in the block.
574         //    8.     code for parenthesized stuff.
575         //    9.   LA_END
576         //
577         //  Two data slots are reserved, for saving the stack ptr and the input position.
578         {
579             fixLiterals();
580             int32_t dataLoc = allocateData(2);
581             appendOp(URX_LA_START, dataLoc);
582             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
583             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
584             appendOp(URX_LA_END, dataLoc);
585             appendOp(URX_BACKTRACK, 0);
586             appendOp(URX_NOP, 0);
587             appendOp(URX_NOP, 0);
588 
589             // On the Parentheses stack, start a new frame and add the postions
590             //   of the NOPs.
591             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
592             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
593             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
594             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
595         }
596         break;
597 
598     case doOpenLookAheadNeg:
599         // Negated Lookahead.   (?! stuff )
600         // Compiles to
601         //    1.    START_LA    dataloc
602         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
603         //                                //   which continues with the match.
604         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
605         //    4.       code for parenthesized stuff.
606         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
607         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
608         //    7.    END_LA                // Restore match region, in case look-ahead was using
609         //                                        an alternate (transparent) region.
610         {
611             fixLiterals();
612             int32_t dataLoc = allocateData(2);
613             appendOp(URX_LA_START, dataLoc);
614             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
615             appendOp(URX_NOP, 0);
616 
617             // On the Parentheses stack, start a new frame and add the postions
618             //   of the StateSave and NOP.
619             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
620             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
621             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
622             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
623 
624             // Instructions #5 - #7 will be added when the ')' is encountered.
625         }
626         break;
627 
628     case doOpenLookBehind:
629         {
630             //   Compile a (?<= look-behind open paren.
631             //
632             //          Compiles to
633             //              0       URX_LB_START     dataLoc
634             //              1       URX_LB_CONT      dataLoc
635             //              2                        MinMatchLen
636             //              3                        MaxMatchLen
637             //              4       URX_NOP          Standard '(' boilerplate.
638             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
639             //              6         <code for LookBehind expression>
640             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
641             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
642             //
643             //          Allocate a block of matcher data, to contain (when running a match)
644             //              0:    Stack ptr on entry
645             //              1:    Input Index on entry
646             //              2:    Start index of match current match attempt.
647             //              3:    Original Input String len.
648 
649             // Generate match code for any pending literals.
650             fixLiterals();
651 
652             // Allocate data space
653             int32_t dataLoc = allocateData(4);
654 
655             // Emit URX_LB_START
656             appendOp(URX_LB_START, dataLoc);
657 
658             // Emit URX_LB_CONT
659             appendOp(URX_LB_CONT, dataLoc);
660             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
661             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
662 
663             // Emit the NOPs
664             appendOp(URX_NOP, 0);
665             appendOp(URX_NOP, 0);
666 
667             // On the Parentheses stack, start a new frame and add the postions
668             //   of the URX_LB_CONT and the NOP.
669             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
670             fParenStack.push(lookBehind, *fStatus);                       // Frame type
671             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
672             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
673 
674             // The final two instructions will be added when the ')' is encountered.
675         }
676 
677         break;
678 
679     case doOpenLookBehindNeg:
680         {
681             //   Compile a (?<! negated look-behind open paren.
682             //
683             //          Compiles to
684             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
685             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
686             //              2                        MinMatchLen
687             //              3                        MaxMatchLen
688             //              4                        continueLoc (9)
689             //              5       URX_NOP          Standard '(' boilerplate.
690             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
691             //              7         <code for LookBehind expression>
692             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
693             //              9       ...
694             //
695             //          Allocate a block of matcher data, to contain (when running a match)
696             //              0:    Stack ptr on entry
697             //              1:    Input Index on entry
698             //              2:    Start index of match current match attempt.
699             //              3:    Original Input String len.
700 
701             // Generate match code for any pending literals.
702             fixLiterals();
703 
704             // Allocate data space
705             int32_t dataLoc = allocateData(4);
706 
707             // Emit URX_LB_START
708             appendOp(URX_LB_START, dataLoc);
709 
710             // Emit URX_LBN_CONT
711             appendOp(URX_LBN_CONT, dataLoc);
712             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
713             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
714             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
715 
716             // Emit the NOPs
717             appendOp(URX_NOP, 0);
718             appendOp(URX_NOP, 0);
719 
720             // On the Parentheses stack, start a new frame and add the postions
721             //   of the URX_LB_CONT and the NOP.
722             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
723             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
724             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
725             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
726 
727             // The final two instructions will be added when the ')' is encountered.
728         }
729         break;
730 
731     case doConditionalExpr:
732         // Conditionals such as (?(1)a:b)
733     case doPerlInline:
734         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
735         error(U_REGEX_UNIMPLEMENTED);
736         break;
737 
738 
739     case doCloseParen:
740         handleCloseParen();
741         if (fParenStack.size() <= 0) {
742             //  Extra close paren, or missing open paren.
743             error(U_REGEX_MISMATCHED_PAREN);
744         }
745         break;
746 
747     case doNOP:
748         break;
749 
750 
751     case doBadOpenParenType:
752     case doRuleError:
753         error(U_REGEX_RULE_SYNTAX);
754         break;
755 
756 
757     case doMismatchedParenErr:
758         error(U_REGEX_MISMATCHED_PAREN);
759         break;
760 
761     case doPlus:
762         //  Normal '+'  compiles to
763         //     1.   stuff to be repeated  (already built)
764         //     2.   jmp-sav 1
765         //     3.   ...
766         //
767         //  Or, if the item to be repeated can match a zero length string,
768         //     1.   STO_INP_LOC  data-loc
769         //     2.      body of stuff to be repeated
770         //     3.   JMP_SAV_X    2
771         //     4.   ...
772 
773         //
774         //  Or, if the item to be repeated is simple
775         //     1.   Item to be repeated.
776         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
777         //     3.   LOOP_C       stack location
778         {
779             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
780             int32_t  frameLoc;
781 
782             // Check for simple constructs, which may get special optimized code.
783             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
784                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
785 
786                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
787                     // Emit optimized code for [char set]+
788                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
789                     frameLoc = allocateStackData(1);
790                     appendOp(URX_LOOP_C, frameLoc);
791                     break;
792                 }
793 
794                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
795                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
796                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
797                     // Emit Optimized code for .+ operations.
798                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
799                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
800                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
801                         loopOpI |= 1;
802                     }
803                     if (fModeFlags & UREGEX_UNIX_LINES) {
804                         loopOpI |= 2;
805                     }
806                     appendOp(loopOpI);
807                     frameLoc = allocateStackData(1);
808                     appendOp(URX_LOOP_C, frameLoc);
809                     break;
810                 }
811 
812             }
813 
814             // General case.
815 
816             // Check for minimum match length of zero, which requires
817             //    extra loop-breaking code.
818             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
819                 // Zero length match is possible.
820                 // Emit the code sequence that can handle it.
821                 insertOp(topLoc);
822                 frameLoc = allocateStackData(1);
823 
824                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
825                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
826 
827                 appendOp(URX_JMP_SAV_X, topLoc+1);
828             } else {
829                 // Simpler code when the repeated body must match something non-empty
830                 appendOp(URX_JMP_SAV, topLoc);
831             }
832         }
833         break;
834 
835     case doNGPlus:
836         //  Non-greedy '+?'  compiles to
837         //     1.   stuff to be repeated  (already built)
838         //     2.   state-save  1
839         //     3.   ...
840         {
841             int32_t topLoc      = blockTopLoc(FALSE);
842             appendOp(URX_STATE_SAVE, topLoc);
843         }
844         break;
845 
846 
847     case doOpt:
848         // Normal (greedy) ? quantifier.
849         //  Compiles to
850         //     1. state save 3
851         //     2.    body of optional block
852         //     3. ...
853         // Insert the state save into the compiled pattern, and we're done.
854         {
855             int32_t   saveStateLoc = blockTopLoc(TRUE);
856             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
857             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
858         }
859         break;
860 
861     case doNGOpt:
862         // Non-greedy ?? quantifier
863         //   compiles to
864         //    1.  jmp   4
865         //    2.     body of optional block
866         //    3   jmp   5
867         //    4.  state save 2
868         //    5    ...
869         //  This code is less than ideal, with two jmps instead of one, because we can only
870         //  insert one instruction at the top of the block being iterated.
871         {
872             int32_t  jmp1_loc = blockTopLoc(TRUE);
873             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
874 
875             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
876             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
877 
878             appendOp(URX_JMP, jmp2_loc+2);
879 
880             appendOp(URX_STATE_SAVE, jmp1_loc+1);
881         }
882         break;
883 
884 
885     case doStar:
886         // Normal (greedy) * quantifier.
887         // Compiles to
888         //       1.   STATE_SAVE   4
889         //       2.      body of stuff being iterated over
890         //       3.   JMP_SAV      2
891         //       4.   ...
892         //
893         // Or, if the body is a simple [Set],
894         //       1.   LOOP_SR_I    set number
895         //       2.   LOOP_C       stack location
896         //       ...
897         //
898         // Or if this is a .*
899         //       1.   LOOP_DOT_I    (. matches all mode flag)
900         //       2.   LOOP_C        stack location
901         //
902         // Or, if the body can match a zero-length string, to inhibit infinite loops,
903         //       1.   STATE_SAVE   5
904         //       2.   STO_INP_LOC  data-loc
905         //       3.      body of stuff
906         //       4.   JMP_SAV_X    2
907         //       5.   ...
908         {
909             // location of item #1, the STATE_SAVE
910             int32_t   topLoc = blockTopLoc(FALSE);
911             int32_t   dataLoc = -1;
912 
913             // Check for simple *, where the construct being repeated
914             //   compiled to single opcode, and might be optimizable.
915             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
916                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
917 
918                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
919                     // Emit optimized code for a [char set]*
920                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
921                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
922                     dataLoc = allocateStackData(1);
923                     appendOp(URX_LOOP_C, dataLoc);
924                     break;
925                 }
926 
927                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
928                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
929                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
930                     // Emit Optimized code for .* operations.
931                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
932                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
933                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
934                         loopOpI |= 1;
935                     }
936                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
937                         loopOpI |= 2;
938                     }
939                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
940                     dataLoc = allocateStackData(1);
941                     appendOp(URX_LOOP_C, dataLoc);
942                     break;
943                 }
944             }
945 
946             // Emit general case code for this *
947             // The optimizations did not apply.
948 
949             int32_t   saveStateLoc = blockTopLoc(TRUE);
950             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
951 
952             // Check for minimum match length of zero, which requires
953             //    extra loop-breaking code.
954             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
955                 insertOp(saveStateLoc);
956                 dataLoc = allocateStackData(1);
957 
958                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
959                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
960                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
961             }
962 
963             // Locate the position in the compiled pattern where the match will continue
964             //   after completing the *.   (4 or 5 in the comment above)
965             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
966 
967             // Put together the save state op and store it into the compiled code.
968             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
969             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
970 
971             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
972             appendOp(jmpOp);
973         }
974         break;
975 
976     case doNGStar:
977         // Non-greedy *? quantifier
978         // compiles to
979         //     1.   JMP    3
980         //     2.      body of stuff being iterated over
981         //     3.   STATE_SAVE  2
982         //     4    ...
983         {
984             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
985             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
986             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
987             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
988             appendOp(URX_STATE_SAVE, jmpLoc+1);
989         }
990         break;
991 
992 
993     case doIntervalInit:
994         // The '{' opening an interval quantifier was just scanned.
995         // Init the counter varaiables that will accumulate the values as the digits
996         //    are scanned.
997         fIntervalLow = 0;
998         fIntervalUpper = -1;
999         break;
1000 
1001     case doIntevalLowerDigit:
1002         // Scanned a digit from the lower value of an {lower,upper} interval
1003         {
1004             int32_t digitValue = u_charDigitValue(fC.fChar);
1005             U_ASSERT(digitValue >= 0);
1006             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1007             if (val > INT32_MAX) {
1008                 error(U_REGEX_NUMBER_TOO_BIG);
1009             } else {
1010                 fIntervalLow = (int32_t)val;
1011             }
1012         }
1013         break;
1014 
1015     case doIntervalUpperDigit:
1016         // Scanned a digit from the upper value of an {lower,upper} interval
1017         {
1018             if (fIntervalUpper < 0) {
1019                 fIntervalUpper = 0;
1020             }
1021             int32_t digitValue = u_charDigitValue(fC.fChar);
1022             U_ASSERT(digitValue >= 0);
1023             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1024             if (val > INT32_MAX) {
1025                 error(U_REGEX_NUMBER_TOO_BIG);
1026             } else {
1027                 fIntervalUpper = (int32_t)val;
1028             }
1029         }
1030         break;
1031 
1032     case doIntervalSame:
1033         // Scanned a single value interval like {27}.  Upper = Lower.
1034         fIntervalUpper = fIntervalLow;
1035         break;
1036 
1037     case doInterval:
1038         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1039         if (compileInlineInterval() == FALSE) {
1040             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1041         }
1042         break;
1043 
1044     case doPossessiveInterval:
1045         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1046         {
1047             // Remember the loc for the top of the block being looped over.
1048             //   (Can not reserve a slot in the compiled pattern at this time, because
1049             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1050             //    once per block.)
1051             int32_t topLoc = blockTopLoc(FALSE);
1052 
1053             // Produce normal looping code.
1054             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1055 
1056             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1057             //  just as if the loop was inclosed in atomic parentheses.
1058 
1059             // First the STO_SP before the start of the loop
1060             insertOp(topLoc);
1061 
1062             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1063             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1064             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1065 
1066             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1067             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1068             loopOp++;     // point LoopOp after the just-inserted STO_SP
1069             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1070 
1071             // Then the LD_SP after the end of the loop
1072             appendOp(URX_LD_SP, varLoc);
1073         }
1074 
1075         break;
1076 
1077     case doNGInterval:
1078         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1079         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1080         break;
1081 
1082     case doIntervalError:
1083         error(U_REGEX_BAD_INTERVAL);
1084         break;
1085 
1086     case doLiteralChar:
1087         // We've just scanned a "normal" character from the pattern,
1088         literalChar(fC.fChar);
1089         break;
1090 
1091 
1092     case doEscapedLiteralChar:
1093         // We've just scanned an backslashed escaped character with  no
1094         //   special meaning.  It represents itself.
1095         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1096             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1097             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1098                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1099              }
1100         literalChar(fC.fChar);
1101         break;
1102 
1103 
1104     case doDotAny:
1105         // scanned a ".",  match any single character.
1106         {
1107             fixLiterals(FALSE);
1108             if (fModeFlags & UREGEX_DOTALL) {
1109                 appendOp(URX_DOTANY_ALL, 0);
1110             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1111                 appendOp(URX_DOTANY_UNIX, 0);
1112             } else {
1113                 appendOp(URX_DOTANY, 0);
1114             }
1115         }
1116         break;
1117 
1118     case doCaret:
1119         {
1120             fixLiterals(FALSE);
1121             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1122                 appendOp(URX_CARET, 0);
1123             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1124                 appendOp(URX_CARET_M, 0);
1125             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1126                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1127             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1128                 appendOp(URX_CARET_M_UNIX, 0);
1129             }
1130         }
1131         break;
1132 
1133     case doDollar:
1134         {
1135             fixLiterals(FALSE);
1136             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1137                 appendOp(URX_DOLLAR, 0);
1138             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1139                 appendOp(URX_DOLLAR_M, 0);
1140             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1141                 appendOp(URX_DOLLAR_D, 0);
1142             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1143                 appendOp(URX_DOLLAR_MD, 0);
1144             }
1145         }
1146         break;
1147 
1148     case doBackslashA:
1149         fixLiterals(FALSE);
1150         appendOp(URX_CARET, 0);
1151         break;
1152 
1153     case doBackslashB:
1154         {
1155             #if  UCONFIG_NO_BREAK_ITERATION==1
1156             if (fModeFlags & UREGEX_UWORD) {
1157                 error(U_UNSUPPORTED_ERROR);
1158             }
1159             #endif
1160             fixLiterals(FALSE);
1161             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1162             appendOp(op, 1);
1163         }
1164         break;
1165 
1166     case doBackslashb:
1167         {
1168             #if  UCONFIG_NO_BREAK_ITERATION==1
1169             if (fModeFlags & UREGEX_UWORD) {
1170                 error(U_UNSUPPORTED_ERROR);
1171             }
1172             #endif
1173             fixLiterals(FALSE);
1174             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1175             appendOp(op, 0);
1176         }
1177         break;
1178 
1179     case doBackslashD:
1180         fixLiterals(FALSE);
1181         appendOp(URX_BACKSLASH_D, 1);
1182         break;
1183 
1184     case doBackslashd:
1185         fixLiterals(FALSE);
1186         appendOp(URX_BACKSLASH_D, 0);
1187         break;
1188 
1189     case doBackslashG:
1190         fixLiterals(FALSE);
1191         appendOp(URX_BACKSLASH_G, 0);
1192         break;
1193 
1194     case doBackslashH:
1195         fixLiterals(FALSE);
1196         appendOp(URX_BACKSLASH_H, 1);
1197         break;
1198 
1199     case doBackslashh:
1200         fixLiterals(FALSE);
1201         appendOp(URX_BACKSLASH_H, 0);
1202         break;
1203 
1204     case doBackslashR:
1205         fixLiterals(FALSE);
1206         appendOp(URX_BACKSLASH_R, 0);
1207         break;
1208 
1209     case doBackslashS:
1210         fixLiterals(FALSE);
1211         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1212         break;
1213 
1214     case doBackslashs:
1215         fixLiterals(FALSE);
1216         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1217         break;
1218 
1219     case doBackslashV:
1220         fixLiterals(FALSE);
1221         appendOp(URX_BACKSLASH_V, 1);
1222         break;
1223 
1224     case doBackslashv:
1225         fixLiterals(FALSE);
1226         appendOp(URX_BACKSLASH_V, 0);
1227         break;
1228 
1229     case doBackslashW:
1230         fixLiterals(FALSE);
1231         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1232         break;
1233 
1234     case doBackslashw:
1235         fixLiterals(FALSE);
1236         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1237         break;
1238 
1239     case doBackslashX:
1240         fixLiterals(FALSE);
1241         appendOp(URX_BACKSLASH_X, 0);
1242         break;
1243 
1244 
1245     case doBackslashZ:
1246         fixLiterals(FALSE);
1247         appendOp(URX_DOLLAR, 0);
1248         break;
1249 
1250     case doBackslashz:
1251         fixLiterals(FALSE);
1252         appendOp(URX_BACKSLASH_Z, 0);
1253         break;
1254 
1255     case doEscapeError:
1256         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1257         break;
1258 
1259     case doExit:
1260         fixLiterals(FALSE);
1261         returnVal = FALSE;
1262         break;
1263 
1264     case doProperty:
1265         {
1266             fixLiterals(FALSE);
1267             UnicodeSet *theSet = scanProp();
1268             compileSet(theSet);
1269         }
1270         break;
1271 
1272     case doNamedChar:
1273         {
1274             UChar32 c = scanNamedChar();
1275             literalChar(c);
1276         }
1277         break;
1278 
1279 
1280     case doBackRef:
1281         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1282         //                 the regular expression, because the number of digits to be consumed
1283         //                 depends on the number of capture groups that have been defined.  So
1284         //                 we have to do it here instead.
1285         {
1286             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1287             int32_t  groupNum = 0;
1288             UChar32  c        = fC.fChar;
1289 
1290             for (;;) {
1291                 // Loop once per digit, for max allowed number of digits in a back reference.
1292                 int32_t digit = u_charDigitValue(c);
1293                 groupNum = groupNum * 10 + digit;
1294                 if (groupNum >= numCaptureGroups) {
1295                     break;
1296                 }
1297                 c = peekCharLL();
1298                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1299                     break;
1300                 }
1301                 nextCharLL();
1302             }
1303 
1304             // Scan of the back reference in the source regexp is complete.  Now generate
1305             //  the compiled code for it.
1306             // Because capture groups can be forward-referenced by back-references,
1307             //  we fill the operand with the capture group number.  At the end
1308             //  of compilation, it will be changed to the variable's location.
1309             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1310                                      //    and shouldn't enter this code path at all.
1311             fixLiterals(FALSE);
1312             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1313                 appendOp(URX_BACKREF_I, groupNum);
1314             } else {
1315                 appendOp(URX_BACKREF, groupNum);
1316             }
1317         }
1318         break;
1319 
1320     case doBeginNamedBackRef:
1321         U_ASSERT(fCaptureName == NULL);
1322         fCaptureName = new UnicodeString;
1323         if (fCaptureName == NULL) {
1324             error(U_MEMORY_ALLOCATION_ERROR);
1325         }
1326         break;
1327 
1328     case doContinueNamedBackRef:
1329         fCaptureName->append(fC.fChar);
1330         break;
1331 
1332     case doCompleteNamedBackRef:
1333         {
1334         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1335         if (groupNumber == 0) {
1336             // Group name has not been defined.
1337             //   Could be a forward reference. If we choose to support them at some
1338             //   future time, extra mechanism will be required at this point.
1339             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1340         } else {
1341             // Given the number, handle identically to a \n numbered back reference.
1342             // See comments above, under doBackRef
1343             fixLiterals(FALSE);
1344             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1345                 appendOp(URX_BACKREF_I, groupNumber);
1346             } else {
1347                 appendOp(URX_BACKREF, groupNumber);
1348             }
1349         }
1350         delete fCaptureName;
1351         fCaptureName = NULL;
1352         break;
1353         }
1354 
1355     case doPossessivePlus:
1356         // Possessive ++ quantifier.
1357         // Compiles to
1358         //       1.   STO_SP
1359         //       2.      body of stuff being iterated over
1360         //       3.   STATE_SAVE 5
1361         //       4.   JMP        2
1362         //       5.   LD_SP
1363         //       6.   ...
1364         //
1365         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1366         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1367         //
1368         {
1369             // Emit the STO_SP
1370             int32_t   topLoc = blockTopLoc(TRUE);
1371             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1372             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1373             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1374 
1375             // Emit the STATE_SAVE
1376             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1377 
1378             // Emit the JMP
1379             appendOp(URX_JMP, topLoc+1);
1380 
1381             // Emit the LD_SP
1382             appendOp(URX_LD_SP, stoLoc);
1383         }
1384         break;
1385 
1386     case doPossessiveStar:
1387         // Possessive *+ quantifier.
1388         // Compiles to
1389         //       1.   STO_SP       loc
1390         //       2.   STATE_SAVE   5
1391         //       3.      body of stuff being iterated over
1392         //       4.   JMP          2
1393         //       5.   LD_SP        loc
1394         //       6    ...
1395         // TODO:  do something to cut back the state stack each time through the loop.
1396         {
1397             // Reserve two slots at the top of the block.
1398             int32_t   topLoc = blockTopLoc(TRUE);
1399             insertOp(topLoc);
1400 
1401             // emit   STO_SP     loc
1402             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1403             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1404             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1405 
1406             // Emit the SAVE_STATE   5
1407             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1408             op = buildOp(URX_STATE_SAVE, L7);
1409             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1410 
1411             // Append the JMP operation.
1412             appendOp(URX_JMP, topLoc+1);
1413 
1414             // Emit the LD_SP       loc
1415             appendOp(URX_LD_SP, stoLoc);
1416         }
1417         break;
1418 
1419     case doPossessiveOpt:
1420         // Possessive  ?+ quantifier.
1421         //  Compiles to
1422         //     1. STO_SP      loc
1423         //     2. SAVE_STATE  5
1424         //     3.    body of optional block
1425         //     4. LD_SP       loc
1426         //     5. ...
1427         //
1428         {
1429             // Reserve two slots at the top of the block.
1430             int32_t   topLoc = blockTopLoc(TRUE);
1431             insertOp(topLoc);
1432 
1433             // Emit the STO_SP
1434             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1435             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1436             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1437 
1438             // Emit the SAVE_STATE
1439             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1440             op = buildOp(URX_STATE_SAVE, continueLoc);
1441             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1442 
1443             // Emit the LD_SP
1444             appendOp(URX_LD_SP, stoLoc);
1445         }
1446         break;
1447 
1448 
1449     case doBeginMatchMode:
1450         fNewModeFlags = fModeFlags;
1451         fSetModeFlag  = TRUE;
1452         break;
1453 
1454     case doMatchMode:   //  (?i)    and similar
1455         {
1456             int32_t  bit = 0;
1457             switch (fC.fChar) {
1458             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1459             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1460             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1461             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1462             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1463             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1464             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1465             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1466             default:
1467                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
1468                                    // by the scanner.
1469             }
1470             if (fSetModeFlag) {
1471                 fNewModeFlags |= bit;
1472             } else {
1473                 fNewModeFlags &= ~bit;
1474             }
1475         }
1476         break;
1477 
1478     case doSetMatchMode:
1479         // Emit code to match any pending literals, using the not-yet changed match mode.
1480         fixLiterals();
1481 
1482         // We've got a (?i) or similar.  The match mode is being changed, but
1483         //   the change is not scoped to a parenthesized block.
1484         U_ASSERT(fNewModeFlags < 0);
1485         fModeFlags = fNewModeFlags;
1486 
1487         break;
1488 
1489 
1490     case doMatchModeParen:
1491         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1492         //   mode flags so they can be restored at the close of the block.
1493         //
1494         //   Compile to a
1495         //      - NOP, which later may be replaced by a save-state if the
1496         //         parenthesized group gets a * quantifier, followed by
1497         //      - NOP, which may later be replaced by a save-state if there
1498         //             is an '|' alternation within the parens.
1499         {
1500             fixLiterals(FALSE);
1501             appendOp(URX_NOP, 0);
1502             appendOp(URX_NOP, 0);
1503 
1504             // On the Parentheses stack, start a new frame and add the postions
1505             //   of the two NOPs (a normal non-capturing () frame, except for the
1506             //   saving of the orignal mode flags.)
1507             fParenStack.push(fModeFlags, *fStatus);
1508             fParenStack.push(flags, *fStatus);                            // Frame Marker
1509             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1510             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1511 
1512             // Set the current mode flags to the new values.
1513             U_ASSERT(fNewModeFlags < 0);
1514             fModeFlags = fNewModeFlags;
1515         }
1516         break;
1517 
1518     case doBadModeFlag:
1519         error(U_REGEX_INVALID_FLAG);
1520         break;
1521 
1522     case doSuppressComments:
1523         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1524         // treating a '#' as a to-the-end-of-line comment.
1525         //   (This Perl compatibility just gets uglier and uglier to do...)
1526         fEOLComments = FALSE;
1527         break;
1528 
1529 
1530     case doSetAddAmp:
1531         {
1532           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1533           set->add(chAmp);
1534         }
1535         break;
1536 
1537     case doSetAddDash:
1538         {
1539           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1540           set->add(chDash);
1541         }
1542         break;
1543 
1544      case doSetBackslash_s:
1545         {
1546          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1547          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1548          break;
1549         }
1550 
1551      case doSetBackslash_S:
1552         {
1553             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1555             SSet.complement();
1556             set->addAll(SSet);
1557             break;
1558         }
1559 
1560     case doSetBackslash_d:
1561         {
1562             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1563             // TODO - make a static set, ticket 6058.
1564             addCategory(set, U_GC_ND_MASK, *fStatus);
1565             break;
1566         }
1567 
1568     case doSetBackslash_D:
1569         {
1570             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1571             UnicodeSet digits;
1572             // TODO - make a static set, ticket 6058.
1573             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1574             digits.complement();
1575             set->addAll(digits);
1576             break;
1577         }
1578 
1579     case doSetBackslash_h:
1580         {
1581             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1582             UnicodeSet h;
1583             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1584             h.add((UChar32)9);   // Tab
1585             set->addAll(h);
1586             break;
1587         }
1588 
1589     case doSetBackslash_H:
1590         {
1591             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1592             UnicodeSet h;
1593             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1594             h.add((UChar32)9);   // Tab
1595             h.complement();
1596             set->addAll(h);
1597             break;
1598         }
1599 
1600     case doSetBackslash_v:
1601         {
1602             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1603             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1604             set->add((UChar32)0x85);
1605             set->add((UChar32)0x2028, (UChar32)0x2029);
1606             break;
1607         }
1608 
1609     case doSetBackslash_V:
1610         {
1611             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1612             UnicodeSet v;
1613             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1614             v.add((UChar32)0x85);
1615             v.add((UChar32)0x2028, (UChar32)0x2029);
1616             v.complement();
1617             set->addAll(v);
1618             break;
1619         }
1620 
1621     case doSetBackslash_w:
1622         {
1623             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1624             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1625             break;
1626         }
1627 
1628     case doSetBackslash_W:
1629         {
1630             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1631             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1632             SSet.complement();
1633             set->addAll(SSet);
1634             break;
1635         }
1636 
1637     case doSetBegin:
1638         fixLiterals(FALSE);
1639         fSetStack.push(new UnicodeSet(), *fStatus);
1640         fSetOpStack.push(setStart, *fStatus);
1641         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1642             fSetOpStack.push(setCaseClose, *fStatus);
1643         }
1644         break;
1645 
1646     case doSetBeginDifference1:
1647         //  We have scanned something like [[abc]-[
1648         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1649         //  Push a Difference operator, which will cause the new set to be subtracted from what
1650         //    went before once it is created.
1651         setPushOp(setDifference1);
1652         fSetOpStack.push(setStart, *fStatus);
1653         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1654             fSetOpStack.push(setCaseClose, *fStatus);
1655         }
1656         break;
1657 
1658     case doSetBeginIntersection1:
1659         //  We have scanned something like  [[abc]&[
1660         //   Need both the '&' operator and the open '[' operator.
1661         setPushOp(setIntersection1);
1662         fSetOpStack.push(setStart, *fStatus);
1663         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1664             fSetOpStack.push(setCaseClose, *fStatus);
1665         }
1666         break;
1667 
1668     case doSetBeginUnion:
1669         //  We have scanned something like  [[abc][
1670         //     Need to handle the union operation explicitly [[abc] | [
1671         setPushOp(setUnion);
1672         fSetOpStack.push(setStart, *fStatus);
1673         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1674             fSetOpStack.push(setCaseClose, *fStatus);
1675         }
1676         break;
1677 
1678     case doSetDifference2:
1679         // We have scanned something like [abc--
1680         //   Consider this to unambiguously be a set difference operator.
1681         setPushOp(setDifference2);
1682         break;
1683 
1684     case doSetEnd:
1685         // Have encountered the ']' that closes a set.
1686         //    Force the evaluation of any pending operations within this set,
1687         //    leave the completed set on the top of the set stack.
1688         setEval(setEnd);
1689         U_ASSERT(fSetOpStack.peeki()==setStart);
1690         fSetOpStack.popi();
1691         break;
1692 
1693     case doSetFinish:
1694         {
1695         // Finished a complete set expression, including all nested sets.
1696         //   The close bracket has already triggered clearing out pending set operators,
1697         //    the operator stack should be empty and the operand stack should have just
1698         //    one entry, the result set.
1699         U_ASSERT(fSetOpStack.empty());
1700         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1701         U_ASSERT(fSetStack.empty());
1702         compileSet(theSet);
1703         break;
1704         }
1705 
1706     case doSetIntersection2:
1707         // Have scanned something like [abc&&
1708         setPushOp(setIntersection2);
1709         break;
1710 
1711     case doSetLiteral:
1712         // Union the just-scanned literal character into the set being built.
1713         //    This operation is the highest precedence set operation, so we can always do
1714         //    it immediately, without waiting to see what follows.  It is necessary to perform
1715         //    any pending '-' or '&' operation first, because these have the same precedence
1716         //    as union-ing in a literal'
1717         {
1718             setEval(setUnion);
1719             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1720             s->add(fC.fChar);
1721             fLastSetLiteral = fC.fChar;
1722             break;
1723         }
1724 
1725     case doSetLiteralEscaped:
1726         // A back-slash escaped literal character was encountered.
1727         // Processing is the same as with setLiteral, above, with the addition of
1728         //  the optional check for errors on escaped ASCII letters.
1729         {
1730             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1731                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1732                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1733                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1734             }
1735             setEval(setUnion);
1736             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1737             s->add(fC.fChar);
1738             fLastSetLiteral = fC.fChar;
1739             break;
1740         }
1741 
1742         case doSetNamedChar:
1743         // Scanning a \N{UNICODE CHARACTER NAME}
1744         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1745         //    above.
1746         {
1747             UChar32  c = scanNamedChar();
1748             setEval(setUnion);
1749             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1750             s->add(c);
1751             fLastSetLiteral = c;
1752             break;
1753         }
1754 
1755     case doSetNamedRange:
1756         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1757         // The left character is already in the set, and is saved in fLastSetLiteral.
1758         // The right side needs to be picked up, the scan is at the 'N'.
1759         // Lower Limit > Upper limit being an error matches both Java
1760         //        and ICU UnicodeSet behavior.
1761         {
1762             UChar32  c = scanNamedChar();
1763             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1764                 error(U_REGEX_INVALID_RANGE);
1765             }
1766             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1767             s->add(fLastSetLiteral, c);
1768             fLastSetLiteral = c;
1769             break;
1770         }
1771 
1772 
1773     case  doSetNegate:
1774         // Scanned a '^' at the start of a set.
1775         // Push the negation operator onto the set op stack.
1776         // A twist for case-insensitive matching:
1777         //   the case closure operation must happen _before_ negation.
1778         //   But the case closure operation will already be on the stack if it's required.
1779         //   This requires checking for case closure, and swapping the stack order
1780         //    if it is present.
1781         {
1782             int32_t  tosOp = fSetOpStack.peeki();
1783             if (tosOp == setCaseClose) {
1784                 fSetOpStack.popi();
1785                 fSetOpStack.push(setNegation, *fStatus);
1786                 fSetOpStack.push(setCaseClose, *fStatus);
1787             } else {
1788                 fSetOpStack.push(setNegation, *fStatus);
1789             }
1790         }
1791         break;
1792 
1793     case doSetNoCloseError:
1794         error(U_REGEX_MISSING_CLOSE_BRACKET);
1795         break;
1796 
1797     case doSetOpError:
1798         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1799         break;
1800 
1801     case doSetPosixProp:
1802         {
1803             UnicodeSet *s = scanPosixProp();
1804             if (s != NULL) {
1805                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1806                 tos->addAll(*s);
1807                 delete s;
1808             }  // else error.  scanProp() reported the error status already.
1809         }
1810         break;
1811 
1812     case doSetProp:
1813         //  Scanned a \p \P within [brackets].
1814         {
1815             UnicodeSet *s = scanProp();
1816             if (s != NULL) {
1817                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1818                 tos->addAll(*s);
1819                 delete s;
1820             }  // else error.  scanProp() reported the error status already.
1821         }
1822         break;
1823 
1824 
1825     case doSetRange:
1826         // We have scanned literal-literal.  Add the range to the set.
1827         // The left character is already in the set, and is saved in fLastSetLiteral.
1828         // The right side is the current character.
1829         // Lower Limit > Upper limit being an error matches both Java
1830         //        and ICU UnicodeSet behavior.
1831         {
1832 
1833         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1834             error(U_REGEX_INVALID_RANGE);
1835         }
1836         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1837         s->add(fLastSetLiteral, fC.fChar);
1838         break;
1839         }
1840 
1841     default:
1842         U_ASSERT(FALSE);
1843         error(U_REGEX_INTERNAL_ERROR);
1844         break;
1845     }
1846 
1847     if (U_FAILURE(*fStatus)) {
1848         returnVal = FALSE;
1849     }
1850 
1851     return returnVal;
1852 }
1853 
1854 
1855 
1856 //------------------------------------------------------------------------------
1857 //
1858 //   literalChar           We've encountered a literal character from the pattern,
1859 //                             or an escape sequence that reduces to a character.
1860 //                         Add it to the string containing all literal chars/strings from
1861 //                             the pattern.
1862 //
1863 //------------------------------------------------------------------------------
literalChar(UChar32 c)1864 void RegexCompile::literalChar(UChar32 c)  {
1865     fLiteralChars.append(c);
1866 }
1867 
1868 
1869 //------------------------------------------------------------------------------
1870 //
1871 //    fixLiterals           When compiling something that can follow a literal
1872 //                          string in a pattern, emit the code to match the
1873 //                          accumulated literal string.
1874 //
1875 //                          Optionally, split the last char of the string off into
1876 //                          a single "ONE_CHAR" operation, so that quantifiers can
1877 //                          apply to that char alone.  Example:   abc*
1878 //                          The * must apply to the 'c' only.
1879 //
1880 //------------------------------------------------------------------------------
fixLiterals(UBool split)1881 void    RegexCompile::fixLiterals(UBool split) {
1882 
1883     // If no literal characters have been scanned but not yet had code generated
1884     //   for them, nothing needs to be done.
1885     if (fLiteralChars.length() == 0) {
1886         return;
1887     }
1888 
1889     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1890     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1891 
1892     // Split:  We need to  ensure that the last item in the compiled pattern
1893     //     refers only to the last literal scanned in the pattern, so that
1894     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1895     //     Split before case folding for case insensitive matches.
1896 
1897     if (split) {
1898         fLiteralChars.truncate(indexOfLastCodePoint);
1899         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1900                               //  Note that the truncated literal string may be empty, in which case
1901                               //  nothing will be emitted.
1902 
1903         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1904         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1905         return;
1906     }
1907 
1908     // If we are doing case-insensitive matching, case fold the string.  This may expand
1909     //   the string, e.g. the German sharp-s turns into "ss"
1910     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1911         fLiteralChars.foldCase();
1912         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1913         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1914     }
1915 
1916     if (indexOfLastCodePoint == 0) {
1917         // Single character, emit a URX_ONECHAR op to match it.
1918         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1919                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1920             appendOp(URX_ONECHAR_I, lastCodePoint);
1921         } else {
1922             appendOp(URX_ONECHAR, lastCodePoint);
1923         }
1924     } else {
1925         // Two or more chars, emit a URX_STRING to match them.
1926         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1927             error(U_REGEX_PATTERN_TOO_BIG);
1928         }
1929         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1930             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1931         } else {
1932             // TODO here:  add optimization to split case sensitive strings of length two
1933             //             into two single char ops, for efficiency.
1934             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1935         }
1936         appendOp(URX_STRING_LEN, fLiteralChars.length());
1937 
1938         // Add this string into the accumulated strings of the compiled pattern.
1939         fRXPat->fLiteralText.append(fLiteralChars);
1940     }
1941 
1942     fLiteralChars.remove();
1943 }
1944 
1945 
buildOp(int32_t type,int32_t val)1946 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1947     if (U_FAILURE(*fStatus)) {
1948         return 0;
1949     }
1950     if (type < 0 || type > 255) {
1951         U_ASSERT(FALSE);
1952         error(U_REGEX_INTERNAL_ERROR);
1953         type = URX_RESERVED_OP;
1954     }
1955     if (val > 0x00ffffff) {
1956         U_ASSERT(FALSE);
1957         error(U_REGEX_INTERNAL_ERROR);
1958         val = 0;
1959     }
1960     if (val < 0) {
1961         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1962             U_ASSERT(FALSE);
1963             error(U_REGEX_INTERNAL_ERROR);
1964             return -1;
1965         }
1966         if (URX_TYPE(val) != 0xff) {
1967             U_ASSERT(FALSE);
1968             error(U_REGEX_INTERNAL_ERROR);
1969             return -1;
1970         }
1971         type = URX_RESERVED_OP_N;
1972     }
1973     return (type << 24) | val;
1974 }
1975 
1976 
1977 //------------------------------------------------------------------------------
1978 //
1979 //   appendOp()             Append a new instruction onto the compiled pattern
1980 //                          Includes error checking, limiting the size of the
1981 //                          pattern to lengths that can be represented in the
1982 //                          24 bit operand field of an instruction.
1983 //
1984 //------------------------------------------------------------------------------
appendOp(int32_t op)1985 void RegexCompile::appendOp(int32_t op) {
1986     if (U_FAILURE(*fStatus)) {
1987         return;
1988     }
1989     fRXPat->fCompiledPat->addElement(op, *fStatus);
1990     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1991         error(U_REGEX_PATTERN_TOO_BIG);
1992     }
1993 }
1994 
appendOp(int32_t type,int32_t val)1995 void RegexCompile::appendOp(int32_t type, int32_t val) {
1996     appendOp(buildOp(type, val));
1997 }
1998 
1999 
2000 //------------------------------------------------------------------------------
2001 //
2002 //   insertOp()             Insert a slot for a new opcode into the already
2003 //                          compiled pattern code.
2004 //
2005 //                          Fill the slot with a NOP.  Our caller will replace it
2006 //                          with what they really wanted.
2007 //
2008 //------------------------------------------------------------------------------
insertOp(int32_t where)2009 void   RegexCompile::insertOp(int32_t where) {
2010     UVector64 *code = fRXPat->fCompiledPat;
2011     U_ASSERT(where>0 && where < code->size());
2012 
2013     int32_t  nop = buildOp(URX_NOP, 0);
2014     code->insertElementAt(nop, where, *fStatus);
2015 
2016     // Walk through the pattern, looking for any ops with targets that
2017     //  were moved down by the insert.  Fix them.
2018     int32_t loc;
2019     for (loc=0; loc<code->size(); loc++) {
2020         int32_t op = (int32_t)code->elementAti(loc);
2021         int32_t opType = URX_TYPE(op);
2022         int32_t opValue = URX_VAL(op);
2023         if ((opType == URX_JMP         ||
2024             opType == URX_JMPX         ||
2025             opType == URX_STATE_SAVE   ||
2026             opType == URX_CTR_LOOP     ||
2027             opType == URX_CTR_LOOP_NG  ||
2028             opType == URX_JMP_SAV      ||
2029             opType == URX_JMP_SAV_X    ||
2030             opType == URX_RELOC_OPRND)    && opValue > where) {
2031             // Target location for this opcode is after the insertion point and
2032             //   needs to be incremented to adjust for the insertion.
2033             opValue++;
2034             op = buildOp(opType, opValue);
2035             code->setElementAt(op, loc);
2036         }
2037     }
2038 
2039     // Now fix up the parentheses stack.  All positive values in it are locations in
2040     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2041     for (loc=0; loc<fParenStack.size(); loc++) {
2042         int32_t x = fParenStack.elementAti(loc);
2043         U_ASSERT(x < code->size());
2044         if (x>where) {
2045             x++;
2046             fParenStack.setElementAt(x, loc);
2047         }
2048     }
2049 
2050     if (fMatchCloseParen > where) {
2051         fMatchCloseParen++;
2052     }
2053     if (fMatchOpenParen > where) {
2054         fMatchOpenParen++;
2055     }
2056 }
2057 
2058 
2059 //------------------------------------------------------------------------------
2060 //
2061 //   allocateData()        Allocate storage in the matcher's static data area.
2062 //                         Return the index for the newly allocated data.
2063 //                         The storage won't actually exist until we are running a match
2064 //                         operation, but the storage indexes are inserted into various
2065 //                         opcodes while compiling the pattern.
2066 //
2067 //------------------------------------------------------------------------------
allocateData(int32_t size)2068 int32_t RegexCompile::allocateData(int32_t size) {
2069     if (U_FAILURE(*fStatus)) {
2070         return 0;
2071     }
2072     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2073         error(U_REGEX_INTERNAL_ERROR);
2074         return 0;
2075     }
2076     int32_t dataIndex = fRXPat->fDataSize;
2077     fRXPat->fDataSize += size;
2078     if (fRXPat->fDataSize >= 0x00fffff0) {
2079         error(U_REGEX_INTERNAL_ERROR);
2080     }
2081     return dataIndex;
2082 }
2083 
2084 
2085 //------------------------------------------------------------------------------
2086 //
2087 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2088 //                         Return the index for the newly allocated data.
2089 //                         The frame indexes are inserted into various
2090 //                         opcodes while compiling the pattern, meaning that frame
2091 //                         size must be restricted to the size that will fit
2092 //                         as an operand (24 bits).
2093 //
2094 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2095 int32_t RegexCompile::allocateStackData(int32_t size) {
2096     if (U_FAILURE(*fStatus)) {
2097         return 0;
2098     }
2099     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2100         error(U_REGEX_INTERNAL_ERROR);
2101         return 0;
2102     }
2103     int32_t dataIndex = fRXPat->fFrameSize;
2104     fRXPat->fFrameSize += size;
2105     if (fRXPat->fFrameSize >= 0x00fffff0) {
2106         error(U_REGEX_PATTERN_TOO_BIG);
2107     }
2108     return dataIndex;
2109 }
2110 
2111 
2112 //------------------------------------------------------------------------------
2113 //
2114 //   blockTopLoc()          Find or create a location in the compiled pattern
2115 //                          at the start of the operation or block that has
2116 //                          just been compiled.  Needed when a quantifier (* or
2117 //                          whatever) appears, and we need to add an operation
2118 //                          at the start of the thing being quantified.
2119 //
2120 //                          (Parenthesized Blocks) have a slot with a NOP that
2121 //                          is reserved for this purpose.  .* or similar don't
2122 //                          and a slot needs to be added.
2123 //
2124 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2125 //                                         at the returned location.
2126 //                                 FALSE - just return the address,
2127 //                                         do not reserve a location there.
2128 //
2129 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2130 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2131     int32_t   theLoc;
2132     fixLiterals(TRUE);  // Emit code for any pending literals.
2133                         //   If last item was a string, emit separate op for the its last char.
2134     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2135     {
2136         // The item just processed is a parenthesized block.
2137         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2138         U_ASSERT(theLoc > 0);
2139         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2140     }
2141     else {
2142         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2143         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2144         // We need to make space now.
2145         theLoc = fRXPat->fCompiledPat->size()-1;
2146         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2147         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2148             // Strings take two opcode, we want the position of the first one.
2149             // We can have a string at this point if a single character case-folded to two.
2150             theLoc--;
2151         }
2152         if (reserveLoc) {
2153             int32_t  nop = buildOp(URX_NOP, 0);
2154             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2155         }
2156     }
2157     return theLoc;
2158 }
2159 
2160 
2161 
2162 //------------------------------------------------------------------------------
2163 //
2164 //    handleCloseParen      When compiling a close paren, we need to go back
2165 //                          and fix up any JMP or SAVE operations within the
2166 //                          parenthesized block that need to target the end
2167 //                          of the block.  The locations of these are kept on
2168 //                          the paretheses stack.
2169 //
2170 //                          This function is called both when encountering a
2171 //                          real ) and at the end of the pattern.
2172 //
2173 //------------------------------------------------------------------------------
handleCloseParen()2174 void  RegexCompile::handleCloseParen() {
2175     int32_t   patIdx;
2176     int32_t   patOp;
2177     if (fParenStack.size() <= 0) {
2178         error(U_REGEX_MISMATCHED_PAREN);
2179         return;
2180     }
2181 
2182     // Emit code for any pending literals.
2183     fixLiterals(FALSE);
2184 
2185     // Fixup any operations within the just-closed parenthesized group
2186     //    that need to reference the end of the (block).
2187     //    (The first one popped from the stack is an unused slot for
2188     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2189     for (;;) {
2190         patIdx = fParenStack.popi();
2191         if (patIdx < 0) {
2192             // value < 0 flags the start of the frame on the paren stack.
2193             break;
2194         }
2195         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2196         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2197         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2198         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2199         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2200         fMatchOpenParen     = patIdx;
2201     }
2202 
2203     //  At the close of any parenthesized block, restore the match mode flags  to
2204     //  the value they had at the open paren.  Saved value is
2205     //  at the top of the paren stack.
2206     fModeFlags = fParenStack.popi();
2207     U_ASSERT(fModeFlags < 0);
2208 
2209     // DO any additional fixups, depending on the specific kind of
2210     // parentesized grouping this is
2211 
2212     switch (patIdx) {
2213     case plain:
2214     case flags:
2215         // No additional fixups required.
2216         //   (Grouping-only parentheses)
2217         break;
2218     case capturing:
2219         // Capturing Parentheses.
2220         //   Insert a End Capture op into the pattern.
2221         //   The frame offset of the variables for this cg is obtained from the
2222         //       start capture op and put it into the end-capture op.
2223         {
2224             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2225             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2226 
2227             int32_t   frameVarLocation = URX_VAL(captureOp);
2228             appendOp(URX_END_CAPTURE, frameVarLocation);
2229         }
2230         break;
2231     case atomic:
2232         // Atomic Parenthesis.
2233         //   Insert a LD_SP operation to restore the state stack to the position
2234         //   it was when the atomic parens were entered.
2235         {
2236             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2237             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2238             int32_t   stoLoc = URX_VAL(stoOp);
2239             appendOp(URX_LD_SP, stoLoc);
2240         }
2241         break;
2242 
2243     case lookAhead:
2244         {
2245             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2246             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2247             int32_t dataLoc  = URX_VAL(startOp);
2248             appendOp(URX_LA_END, dataLoc);
2249         }
2250         break;
2251 
2252     case negLookAhead:
2253         {
2254             // See comment at doOpenLookAheadNeg
2255             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2256             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2257             int32_t dataLoc  = URX_VAL(startOp);
2258             appendOp(URX_LA_END, dataLoc);
2259             appendOp(URX_BACKTRACK, 0);
2260             appendOp(URX_LA_END, dataLoc);
2261 
2262             // Patch the URX_SAVE near the top of the block.
2263             // The destination of the SAVE is the final LA_END that was just added.
2264             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2265             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2266             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2267             saveOp           = buildOp(URX_STATE_SAVE, dest);
2268             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2269         }
2270         break;
2271 
2272     case lookBehind:
2273         {
2274             // See comment at doOpenLookBehind.
2275 
2276             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2277             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2278             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2279             int32_t dataLoc  = URX_VAL(startOp);
2280             appendOp(URX_LB_END, dataLoc);
2281             appendOp(URX_LA_END, dataLoc);
2282 
2283             // Determine the min and max bounds for the length of the
2284             //  string that the pattern can match.
2285             //  An unbounded upper limit is an error.
2286             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2287             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2288             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2289             if (URX_TYPE(maxML) != 0) {
2290                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2291                 break;
2292             }
2293             if (maxML == INT32_MAX) {
2294                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2295                 break;
2296             }
2297             U_ASSERT(minML <= maxML);
2298 
2299             // Insert the min and max match len bounds into the URX_LB_CONT op that
2300             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2301             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2302             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2303 
2304         }
2305         break;
2306 
2307 
2308 
2309     case lookBehindN:
2310         {
2311             // See comment at doOpenLookBehindNeg.
2312 
2313             // Append the URX_LBN_END to the compiled pattern.
2314             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2315             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2316             int32_t dataLoc  = URX_VAL(startOp);
2317             appendOp(URX_LBN_END, dataLoc);
2318 
2319             // Determine the min and max bounds for the length of the
2320             //  string that the pattern can match.
2321             //  An unbounded upper limit is an error.
2322             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2323             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2324             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2325             if (URX_TYPE(maxML) != 0) {
2326                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2327                 break;
2328             }
2329             if (maxML == INT32_MAX) {
2330                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2331                 break;
2332             }
2333             U_ASSERT(minML <= maxML);
2334 
2335             // Insert the min and max match len bounds into the URX_LB_CONT op that
2336             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2337             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2338             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2339 
2340             // Insert the pattern location to continue at after a successful match
2341             //  as the last operand of the URX_LBN_CONT
2342             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2343             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2344         }
2345         break;
2346 
2347 
2348 
2349     default:
2350         U_ASSERT(FALSE);
2351     }
2352 
2353     // remember the next location in the compiled pattern.
2354     // The compilation of Quantifiers will look at this to see whether its looping
2355     //   over a parenthesized block or a single item
2356     fMatchCloseParen = fRXPat->fCompiledPat->size();
2357 }
2358 
2359 
2360 
2361 //------------------------------------------------------------------------------
2362 //
2363 //   compileSet       Compile the pattern operations for a reference to a
2364 //                    UnicodeSet.
2365 //
2366 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2367 void        RegexCompile::compileSet(UnicodeSet *theSet)
2368 {
2369     if (theSet == NULL) {
2370         return;
2371     }
2372     //  Remove any strings from the set.
2373     //  There shoudn't be any, but just in case.
2374     //     (Case Closure can add them; if we had a simple case closure avaialble that
2375     //      ignored strings, that would be better.)
2376     theSet->removeAllStrings();
2377     int32_t  setSize = theSet->size();
2378 
2379     switch (setSize) {
2380     case 0:
2381         {
2382             // Set of no elements.   Always fails to match.
2383             appendOp(URX_BACKTRACK, 0);
2384             delete theSet;
2385         }
2386         break;
2387 
2388     case 1:
2389         {
2390             // The set contains only a single code point.  Put it into
2391             //   the compiled pattern as a single char operation rather
2392             //   than a set, and discard the set itself.
2393             literalChar(theSet->charAt(0));
2394             delete theSet;
2395         }
2396         break;
2397 
2398     default:
2399         {
2400             //  The set contains two or more chars.  (the normal case)
2401             //  Put it into the compiled pattern as a set.
2402             int32_t setNumber = fRXPat->fSets->size();
2403             fRXPat->fSets->addElement(theSet, *fStatus);
2404             appendOp(URX_SETREF, setNumber);
2405         }
2406     }
2407 }
2408 
2409 
2410 //------------------------------------------------------------------------------
2411 //
2412 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2413 //                      Except for the specific opcodes used, the code is the same
2414 //                      for all three types (greedy, non-greedy, possessive) of
2415 //                      intervals.  The opcodes are supplied as parameters.
2416 //                      (There are two sets of opcodes - greedy & possessive use the
2417 //                      same ones, while non-greedy has it's own.)
2418 //
2419 //                      The code for interval loops has this form:
2420 //                         0  CTR_INIT   counter loc (in stack frame)
2421 //                         1             5  patt address of CTR_LOOP at bottom of block
2422 //                         2             min count
2423 //                         3             max count   (-1 for unbounded)
2424 //                         4  ...        block to be iterated over
2425 //                         5  CTR_LOOP
2426 //
2427 //                       In
2428 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2429 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2430 {
2431     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2432     //   four slots in the compiled code.  Reserve them.
2433     int32_t   topOfBlock = blockTopLoc(TRUE);
2434     insertOp(topOfBlock);
2435     insertOp(topOfBlock);
2436     insertOp(topOfBlock);
2437 
2438     // The operands for the CTR_INIT opcode include the index in the matcher data
2439     //   of the counter.  Allocate it now. There are two data items
2440     //        counterLoc   -->  Loop counter
2441     //               +1    -->  Input index (for breaking non-progressing loops)
2442     //                          (Only present if unbounded upper limit on loop)
2443     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2444     int32_t   counterLoc = allocateStackData(dataSize);
2445 
2446     int32_t   op = buildOp(InitOp, counterLoc);
2447     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2448 
2449     // The second operand of CTR_INIT is the location following the end of the loop.
2450     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2451     //   compilation of something later on causes the code to grow and the target
2452     //   position to move.
2453     int32_t loopEnd = fRXPat->fCompiledPat->size();
2454     op = buildOp(URX_RELOC_OPRND, loopEnd);
2455     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2456 
2457     // Followed by the min and max counts.
2458     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2459     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2460 
2461     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2462     //   Goes at end of the block being looped over, so just append to the code so far.
2463     appendOp(LoopOp, topOfBlock);
2464 
2465     if ((fIntervalLow & 0xff000000) != 0 ||
2466         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2467             error(U_REGEX_NUMBER_TOO_BIG);
2468         }
2469 
2470     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2471         error(U_REGEX_MAX_LT_MIN);
2472     }
2473 }
2474 
2475 
2476 
compileInlineInterval()2477 UBool RegexCompile::compileInlineInterval() {
2478     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2479         // Too big to inline.  Fail, which will cause looping code to be generated.
2480         //   (Upper < Lower picks up unbounded upper and errors, both.)
2481         return FALSE;
2482     }
2483 
2484     int32_t   topOfBlock = blockTopLoc(FALSE);
2485     if (fIntervalUpper == 0) {
2486         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2487         // Discard the generated code for the block.
2488         // If the block included parens, discard the info pertaining to them as well.
2489         fRXPat->fCompiledPat->setSize(topOfBlock);
2490         if (fMatchOpenParen >= topOfBlock) {
2491             fMatchOpenParen = -1;
2492         }
2493         if (fMatchCloseParen >= topOfBlock) {
2494             fMatchCloseParen = -1;
2495         }
2496         return TRUE;
2497     }
2498 
2499     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2500         // The thing being repeated is not a single op, but some
2501         //   more complex block.  Do it as a loop, not inlines.
2502         //   Note that things "repeated" a max of once are handled as inline, because
2503         //     the one copy of the code already generated is just fine.
2504         return FALSE;
2505     }
2506 
2507     // Pick up the opcode that is to be repeated
2508     //
2509     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2510 
2511     // Compute the pattern location where the inline sequence
2512     //   will end, and set up the state save op that will be needed.
2513     //
2514     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2515                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2516     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2517     if (fIntervalLow == 0) {
2518         insertOp(topOfBlock);
2519         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2520     }
2521 
2522 
2523 
2524     //  Loop, emitting the op for the thing being repeated each time.
2525     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2526     //    it was put there when it was originally encountered.
2527     int32_t i;
2528     for (i=1; i<fIntervalUpper; i++ ) {
2529         if (i >= fIntervalLow) {
2530             appendOp(saveOp);
2531         }
2532         appendOp(op);
2533     }
2534     return TRUE;
2535 }
2536 
2537 
2538 
2539 //------------------------------------------------------------------------------
2540 //
2541 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2542 //                         set of characters that could potentially begin a case-insensitive
2543 //                         match of a string beginning with that character, using full Unicode
2544 //                         case insensitive matching.
2545 //
2546 //          This is used in optimizing find().
2547 //
2548 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2549 //          misses cases like this:
2550 //             A string from the pattern begins with 'ss' (although all we know
2551 //                 in this context is that it begins with 's')
2552 //             The pattern could match a string beginning with a German sharp-s
2553 //
2554 //           To the ordinary case closure for a character c, we add all other
2555 //           characters cx where the case closure of cx incudes a string form that begins
2556 //           with the original character c.
2557 //
2558 //           This function could be made smarter. The full pattern string is available
2559 //           and it would be possible to verify that the extra characters being added
2560 //           to the starting set fully match, rather than having just a first-char of the
2561 //           folded form match.
2562 //
2563 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2564 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2565 
2566 // Machine Generated below.
2567 // It may need updating with new versions of Unicode.
2568 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2569 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2570 
2571 // Machine Generated Data. Do not hand edit.
2572     static const UChar32 RECaseFixCodePoints[] = {
2573         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2574         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2575         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2576         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2577         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2578 
2579     static const int16_t RECaseFixStringOffsets[] = {
2580         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2581         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2582         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2583         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2584         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2585 
2586     static const int16_t RECaseFixCounts[] = {
2587         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2588         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2589         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2590         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2591         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2592 
2593     static const UChar RECaseFixData[] = {
2594         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2595         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2596         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2597         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2598         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2599         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2600         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2601         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2602         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2603         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2604         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2605 
2606 // End of machine generated data.
2607 
2608     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2609         // This function should never be called with an invalid input character.
2610         U_ASSERT(FALSE);
2611         starterChars->clear();
2612     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2613         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2614         starterChars->set(caseFoldedC, caseFoldedC);
2615 
2616         int32_t i;
2617         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2618             // Simple linear search through the sorted list of interesting code points.
2619         }
2620 
2621         if (RECaseFixCodePoints[i] == c) {
2622             int32_t dataIndex = RECaseFixStringOffsets[i];
2623             int32_t numCharsToAdd = RECaseFixCounts[i];
2624             UChar32 cpToAdd = 0;
2625             for (int32_t j=0; j<numCharsToAdd; j++) {
2626                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2627                 starterChars->add(cpToAdd);
2628             }
2629         }
2630 
2631         starterChars->closeOver(USET_CASE_INSENSITIVE);
2632         starterChars->removeAllStrings();
2633     } else {
2634         // Not a cased character. Just return it alone.
2635         starterChars->set(c, c);
2636     }
2637 }
2638 
2639 
2640 
2641 
2642 //------------------------------------------------------------------------------
2643 //
2644 //   matchStartType    Determine how a match can start.
2645 //                     Used to optimize find() operations.
2646 //
2647 //                     Operation is very similar to minMatchLength().  Walk the compiled
2648 //                     pattern, keeping an on-going minimum-match-length.  For any
2649 //                     op where the min match coming in is zero, add that ops possible
2650 //                     starting matches to the possible starts for the overall pattern.
2651 //
2652 //------------------------------------------------------------------------------
matchStartType()2653 void   RegexCompile::matchStartType() {
2654     if (U_FAILURE(*fStatus)) {
2655         return;
2656     }
2657 
2658 
2659     int32_t    loc;                    // Location in the pattern of the current op being processed.
2660     int32_t    op;                     // The op being processed
2661     int32_t    opType;                 // The opcode type of the op
2662     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2663     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2664 
2665     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2666                                        //   could have advanced the position in a match.
2667                                        //   (Maximum match length so far == 0)
2668 
2669     // forwardedLength is a vector holding minimum-match-length values that
2670     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2671     //   It must be one longer than the pattern being checked because some  ops
2672     //   will jmp to a end-of-block+1 location from within a block, and we must
2673     //   count those when checking the block.
2674     int32_t end = fRXPat->fCompiledPat->size();
2675     UVector32  forwardedLength(end+1, *fStatus);
2676     forwardedLength.setSize(end+1);
2677     for (loc=3; loc<end; loc++) {
2678         forwardedLength.setElementAt(INT32_MAX, loc);
2679     }
2680 
2681     for (loc = 3; loc<end; loc++) {
2682         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2683         opType = URX_TYPE(op);
2684 
2685         // The loop is advancing linearly through the pattern.
2686         // If the op we are now at was the destination of a branch in the pattern,
2687         // and that path has a shorter minimum length than the current accumulated value,
2688         // replace the current accumulated value.
2689         if (forwardedLength.elementAti(loc) < currentLen) {
2690             currentLen = forwardedLength.elementAti(loc);
2691             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2692         }
2693 
2694         switch (opType) {
2695             // Ops that don't change the total length matched
2696         case URX_RESERVED_OP:
2697         case URX_END:
2698         case URX_FAIL:
2699         case URX_STRING_LEN:
2700         case URX_NOP:
2701         case URX_START_CAPTURE:
2702         case URX_END_CAPTURE:
2703         case URX_BACKSLASH_B:
2704         case URX_BACKSLASH_BU:
2705         case URX_BACKSLASH_G:
2706         case URX_BACKSLASH_Z:
2707         case URX_DOLLAR:
2708         case URX_DOLLAR_M:
2709         case URX_DOLLAR_D:
2710         case URX_DOLLAR_MD:
2711         case URX_RELOC_OPRND:
2712         case URX_STO_INP_LOC:
2713         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2714         case URX_BACKREF_I:
2715 
2716         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2717         case URX_LD_SP:
2718             break;
2719 
2720         case URX_CARET:
2721             if (atStart) {
2722                 fRXPat->fStartType = START_START;
2723             }
2724             break;
2725 
2726         case URX_CARET_M:
2727         case URX_CARET_M_UNIX:
2728             if (atStart) {
2729                 fRXPat->fStartType = START_LINE;
2730             }
2731             break;
2732 
2733         case URX_ONECHAR:
2734             if (currentLen == 0) {
2735                 // This character could appear at the start of a match.
2736                 //   Add it to the set of possible starting characters.
2737                 fRXPat->fInitialChars->add(URX_VAL(op));
2738                 numInitialStrings += 2;
2739             }
2740             currentLen++;
2741             atStart = FALSE;
2742             break;
2743 
2744 
2745         case URX_SETREF:
2746             if (currentLen == 0) {
2747                 int32_t  sn = URX_VAL(op);
2748                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2749                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2750                 fRXPat->fInitialChars->addAll(*s);
2751                 numInitialStrings += 2;
2752             }
2753             currentLen++;
2754             atStart = FALSE;
2755             break;
2756 
2757         case URX_LOOP_SR_I:
2758             // [Set]*, like a SETREF, above, in what it can match,
2759             //  but may not match at all, so currentLen is not incremented.
2760             if (currentLen == 0) {
2761                 int32_t  sn = URX_VAL(op);
2762                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2763                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2764                 fRXPat->fInitialChars->addAll(*s);
2765                 numInitialStrings += 2;
2766             }
2767             atStart = FALSE;
2768             break;
2769 
2770         case URX_LOOP_DOT_I:
2771             if (currentLen == 0) {
2772                 // .* at the start of a pattern.
2773                 //    Any character can begin the match.
2774                 fRXPat->fInitialChars->clear();
2775                 fRXPat->fInitialChars->complement();
2776                 numInitialStrings += 2;
2777             }
2778             atStart = FALSE;
2779             break;
2780 
2781 
2782         case URX_STATIC_SETREF:
2783             if (currentLen == 0) {
2784                 int32_t  sn = URX_VAL(op);
2785                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2786                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2787                 fRXPat->fInitialChars->addAll(*s);
2788                 numInitialStrings += 2;
2789             }
2790             currentLen++;
2791             atStart = FALSE;
2792             break;
2793 
2794 
2795 
2796         case URX_STAT_SETREF_N:
2797             if (currentLen == 0) {
2798                 int32_t  sn = URX_VAL(op);
2799                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2800                 UnicodeSet sc(*s);
2801                 sc.complement();
2802                 fRXPat->fInitialChars->addAll(sc);
2803                 numInitialStrings += 2;
2804             }
2805             currentLen++;
2806             atStart = FALSE;
2807             break;
2808 
2809 
2810 
2811         case URX_BACKSLASH_D:
2812             // Digit Char
2813              if (currentLen == 0) {
2814                  UnicodeSet s;
2815                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2816                  if (URX_VAL(op) != 0) {
2817                      s.complement();
2818                  }
2819                  fRXPat->fInitialChars->addAll(s);
2820                  numInitialStrings += 2;
2821             }
2822             currentLen++;
2823             atStart = FALSE;
2824             break;
2825 
2826 
2827         case URX_BACKSLASH_H:
2828             // Horiz white space
2829             if (currentLen == 0) {
2830                 UnicodeSet s;
2831                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2832                 s.add((UChar32)9);   // Tab
2833                 if (URX_VAL(op) != 0) {
2834                     s.complement();
2835                 }
2836                 fRXPat->fInitialChars->addAll(s);
2837                 numInitialStrings += 2;
2838             }
2839             currentLen++;
2840             atStart = FALSE;
2841             break;
2842 
2843 
2844         case URX_BACKSLASH_R:       // Any line ending sequence
2845         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2846             if (currentLen == 0) {
2847                 UnicodeSet s;
2848                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2849                 s.add((UChar32)0x85);
2850                 s.add((UChar32)0x2028, (UChar32)0x2029);
2851                 if (URX_VAL(op) != 0) {
2852                      // Complement option applies to URX_BACKSLASH_V only.
2853                      s.complement();
2854                 }
2855                 fRXPat->fInitialChars->addAll(s);
2856                 numInitialStrings += 2;
2857             }
2858             currentLen++;
2859             atStart = FALSE;
2860             break;
2861 
2862 
2863 
2864         case URX_ONECHAR_I:
2865             // Case Insensitive Single Character.
2866             if (currentLen == 0) {
2867                 UChar32  c = URX_VAL(op);
2868                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2869                     UnicodeSet starters(c, c);
2870                     starters.closeOver(USET_CASE_INSENSITIVE);
2871                     // findCaseInsensitiveStarters(c, &starters);
2872                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2873                     //   The expanded folding can't match the pattern.
2874                     fRXPat->fInitialChars->addAll(starters);
2875                 } else {
2876                     // Char has no case variants.  Just add it as-is to the
2877                     //   set of possible starting chars.
2878                     fRXPat->fInitialChars->add(c);
2879                 }
2880                 numInitialStrings += 2;
2881             }
2882             currentLen++;
2883             atStart = FALSE;
2884             break;
2885 
2886 
2887         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2888         case URX_DOTANY_ALL:    // . matches one or two.
2889         case URX_DOTANY:
2890         case URX_DOTANY_UNIX:
2891             if (currentLen == 0) {
2892                 // These constructs are all bad news when they appear at the start
2893                 //   of a match.  Any character can begin the match.
2894                 fRXPat->fInitialChars->clear();
2895                 fRXPat->fInitialChars->complement();
2896                 numInitialStrings += 2;
2897             }
2898             currentLen++;
2899             atStart = FALSE;
2900             break;
2901 
2902 
2903         case URX_JMPX:
2904             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2905             U_FALLTHROUGH;
2906         case URX_JMP:
2907             {
2908                 int32_t  jmpDest = URX_VAL(op);
2909                 if (jmpDest < loc) {
2910                     // Loop of some kind.  Can safely ignore, the worst that will happen
2911                     //  is that we understate the true minimum length
2912                     currentLen = forwardedLength.elementAti(loc+1);
2913 
2914                 } else {
2915                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2916                     U_ASSERT(jmpDest <= end+1);
2917                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2918                         forwardedLength.setElementAt(currentLen, jmpDest);
2919                     }
2920                 }
2921             }
2922             atStart = FALSE;
2923             break;
2924 
2925         case URX_JMP_SAV:
2926         case URX_JMP_SAV_X:
2927             // Combo of state save to the next loc, + jmp backwards.
2928             //   Net effect on min. length computation is nothing.
2929             atStart = FALSE;
2930             break;
2931 
2932         case URX_BACKTRACK:
2933             // Fails are kind of like a branch, except that the min length was
2934             //   propagated already, by the state save.
2935             currentLen = forwardedLength.elementAti(loc+1);
2936             atStart = FALSE;
2937             break;
2938 
2939 
2940         case URX_STATE_SAVE:
2941             {
2942                 // State Save, for forward jumps, propagate the current minimum.
2943                 //             of the state save.
2944                 int32_t  jmpDest = URX_VAL(op);
2945                 if (jmpDest > loc) {
2946                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2947                         forwardedLength.setElementAt(currentLen, jmpDest);
2948                     }
2949                 }
2950             }
2951             atStart = FALSE;
2952             break;
2953 
2954 
2955 
2956 
2957         case URX_STRING:
2958             {
2959                 loc++;
2960                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2961                 int32_t stringLen   = URX_VAL(stringLenOp);
2962                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2963                 U_ASSERT(stringLenOp >= 2);
2964                 if (currentLen == 0) {
2965                     // Add the starting character of this string to the set of possible starting
2966                     //   characters for this pattern.
2967                     int32_t stringStartIdx = URX_VAL(op);
2968                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2969                     fRXPat->fInitialChars->add(c);
2970 
2971                     // Remember this string.  After the entire pattern has been checked,
2972                     //  if nothing else is identified that can start a match, we'll use it.
2973                     numInitialStrings++;
2974                     fRXPat->fInitialStringIdx = stringStartIdx;
2975                     fRXPat->fInitialStringLen = stringLen;
2976                 }
2977 
2978                 currentLen += stringLen;
2979                 atStart = FALSE;
2980             }
2981             break;
2982 
2983         case URX_STRING_I:
2984             {
2985                 // Case-insensitive string.  Unlike exact-match strings, we won't
2986                 //   attempt a string search for possible match positions.  But we
2987                 //   do update the set of possible starting characters.
2988                 loc++;
2989                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2990                 int32_t stringLen   = URX_VAL(stringLenOp);
2991                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2992                 U_ASSERT(stringLenOp >= 2);
2993                 if (currentLen == 0) {
2994                     // Add the starting character of this string to the set of possible starting
2995                     //   characters for this pattern.
2996                     int32_t stringStartIdx = URX_VAL(op);
2997                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2998                     UnicodeSet s;
2999                     findCaseInsensitiveStarters(c, &s);
3000                     fRXPat->fInitialChars->addAll(s);
3001                     numInitialStrings += 2;  // Matching on an initial string not possible.
3002                 }
3003                 currentLen += stringLen;
3004                 atStart = FALSE;
3005             }
3006             break;
3007 
3008         case URX_CTR_INIT:
3009         case URX_CTR_INIT_NG:
3010             {
3011                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3012                 //   so location must be updated accordingly.
3013                 // Loop Init Ops.
3014                 //   If the min loop count == 0
3015                 //      move loc forwards to the end of the loop, skipping over the body.
3016                 //   If the min count is > 0,
3017                 //      continue normal processing of the body of the loop.
3018                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3019                         loopEndLoc   = URX_VAL(loopEndLoc);
3020                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3021                 if (minLoopCount == 0) {
3022                     // Min Loop Count of 0, treat like a forward branch and
3023                     //   move the current minimum length up to the target
3024                     //   (end of loop) location.
3025                     U_ASSERT(loopEndLoc <= end+1);
3026                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3027                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3028                     }
3029                 }
3030                 loc+=3;  // Skips over operands of CTR_INIT
3031             }
3032             atStart = FALSE;
3033             break;
3034 
3035 
3036         case URX_CTR_LOOP:
3037         case URX_CTR_LOOP_NG:
3038             // Loop ops.
3039             //  The jump is conditional, backwards only.
3040             atStart = FALSE;
3041             break;
3042 
3043         case URX_LOOP_C:
3044             // More loop ops.  These state-save to themselves.
3045             //   don't change the minimum match
3046             atStart = FALSE;
3047             break;
3048 
3049 
3050         case URX_LA_START:
3051         case URX_LB_START:
3052             {
3053                 // Look-around.  Scan forward until the matching look-ahead end,
3054                 //   without processing the look-around block.  This is overly pessimistic.
3055 
3056                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3057                 //   lookahead contains two LA_END instructions, so count goes up by two
3058                 //   for each LA_START.
3059                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3060                 for (;;) {
3061                     loc++;
3062                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3063                     if (URX_TYPE(op) == URX_LA_START) {
3064                         depth+=2;
3065                     }
3066                     if (URX_TYPE(op) == URX_LB_START) {
3067                         depth++;
3068                     }
3069                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3070                         depth--;
3071                         if (depth == 0) {
3072                             break;
3073                         }
3074                     }
3075                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3076                         // Need this because neg lookahead blocks will FAIL to outside
3077                         //   of the block.
3078                         int32_t  jmpDest = URX_VAL(op);
3079                         if (jmpDest > loc) {
3080                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3081                                 forwardedLength.setElementAt(currentLen, jmpDest);
3082                             }
3083                         }
3084                     }
3085                     U_ASSERT(loc <= end);
3086                 }
3087             }
3088             break;
3089 
3090         case URX_LA_END:
3091         case URX_LB_CONT:
3092         case URX_LB_END:
3093         case URX_LBN_CONT:
3094         case URX_LBN_END:
3095             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
3096                                  //  consumed by the scan in URX_LA_START and LB_START
3097 
3098             break;
3099 
3100         default:
3101             U_ASSERT(FALSE);
3102             }
3103 
3104         }
3105 
3106 
3107     // We have finished walking through the ops.  Check whether some forward jump
3108     //   propagated a shorter length to location end+1.
3109     if (forwardedLength.elementAti(end+1) < currentLen) {
3110         currentLen = forwardedLength.elementAti(end+1);
3111     }
3112 
3113 
3114     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3115 
3116 
3117     // Sort out what we should check for when looking for candidate match start positions.
3118     // In order of preference,
3119     //     1.   Start of input text buffer.
3120     //     2.   A literal string.
3121     //     3.   Start of line in multi-line mode.
3122     //     4.   A single literal character.
3123     //     5.   A character from a set of characters.
3124     //
3125     if (fRXPat->fStartType == START_START) {
3126         // Match only at the start of an input text string.
3127         //    start type is already set.  We're done.
3128     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3129         // Match beginning only with a literal string.
3130         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3131         U_ASSERT(fRXPat->fInitialChars->contains(c));
3132         fRXPat->fStartType   = START_STRING;
3133         fRXPat->fInitialChar = c;
3134     } else if (fRXPat->fStartType == START_LINE) {
3135         // Match at start of line in Multi-Line mode.
3136         // Nothing to do here; everything is already set.
3137     } else if (fRXPat->fMinMatchLen == 0) {
3138         // Zero length match possible.  We could start anywhere.
3139         fRXPat->fStartType = START_NO_INFO;
3140     } else if (fRXPat->fInitialChars->size() == 1) {
3141         // All matches begin with the same char.
3142         fRXPat->fStartType   = START_CHAR;
3143         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3144         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3145     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3146         fRXPat->fMinMatchLen > 0) {
3147         // Matches start with a set of character smaller than the set of all chars.
3148         fRXPat->fStartType = START_SET;
3149     } else {
3150         // Matches can start with anything
3151         fRXPat->fStartType = START_NO_INFO;
3152     }
3153 
3154     return;
3155 }
3156 
3157 
3158 
3159 //------------------------------------------------------------------------------
3160 //
3161 //   minMatchLength    Calculate the length of the shortest string that could
3162 //                     match the specified pattern.
3163 //                     Length is in 16 bit code units, not code points.
3164 //
3165 //                     The calculated length may not be exact.  The returned
3166 //                     value may be shorter than the actual minimum; it must
3167 //                     never be longer.
3168 //
3169 //                     start and end are the range of p-code operations to be
3170 //                     examined.  The endpoints are included in the range.
3171 //
3172 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3173 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3174     if (U_FAILURE(*fStatus)) {
3175         return 0;
3176     }
3177 
3178     U_ASSERT(start <= end);
3179     U_ASSERT(end < fRXPat->fCompiledPat->size());
3180 
3181 
3182     int32_t    loc;
3183     int32_t    op;
3184     int32_t    opType;
3185     int32_t    currentLen = 0;
3186 
3187 
3188     // forwardedLength is a vector holding minimum-match-length values that
3189     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3190     //   It must be one longer than the pattern being checked because some  ops
3191     //   will jmp to a end-of-block+1 location from within a block, and we must
3192     //   count those when checking the block.
3193     UVector32  forwardedLength(end+2, *fStatus);
3194     forwardedLength.setSize(end+2);
3195     for (loc=start; loc<=end+1; loc++) {
3196         forwardedLength.setElementAt(INT32_MAX, loc);
3197     }
3198 
3199     for (loc = start; loc<=end; loc++) {
3200         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3201         opType = URX_TYPE(op);
3202 
3203         // The loop is advancing linearly through the pattern.
3204         // If the op we are now at was the destination of a branch in the pattern,
3205         // and that path has a shorter minimum length than the current accumulated value,
3206         // replace the current accumulated value.
3207         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3208                                                                //   no-match-possible cases.
3209         if (forwardedLength.elementAti(loc) < currentLen) {
3210             currentLen = forwardedLength.elementAti(loc);
3211             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3212         }
3213 
3214         switch (opType) {
3215             // Ops that don't change the total length matched
3216         case URX_RESERVED_OP:
3217         case URX_END:
3218         case URX_STRING_LEN:
3219         case URX_NOP:
3220         case URX_START_CAPTURE:
3221         case URX_END_CAPTURE:
3222         case URX_BACKSLASH_B:
3223         case URX_BACKSLASH_BU:
3224         case URX_BACKSLASH_G:
3225         case URX_BACKSLASH_Z:
3226         case URX_CARET:
3227         case URX_DOLLAR:
3228         case URX_DOLLAR_M:
3229         case URX_DOLLAR_D:
3230         case URX_DOLLAR_MD:
3231         case URX_RELOC_OPRND:
3232         case URX_STO_INP_LOC:
3233         case URX_CARET_M:
3234         case URX_CARET_M_UNIX:
3235         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3236         case URX_BACKREF_I:
3237 
3238         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3239         case URX_LD_SP:
3240 
3241         case URX_JMP_SAV:
3242         case URX_JMP_SAV_X:
3243             break;
3244 
3245 
3246             // Ops that match a minimum of one character (one or two 16 bit code units.)
3247             //
3248         case URX_ONECHAR:
3249         case URX_STATIC_SETREF:
3250         case URX_STAT_SETREF_N:
3251         case URX_SETREF:
3252         case URX_BACKSLASH_D:
3253         case URX_BACKSLASH_H:
3254         case URX_BACKSLASH_R:
3255         case URX_BACKSLASH_V:
3256         case URX_ONECHAR_I:
3257         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3258         case URX_DOTANY_ALL:    // . matches one or two.
3259         case URX_DOTANY:
3260         case URX_DOTANY_UNIX:
3261             currentLen++;
3262             break;
3263 
3264 
3265         case URX_JMPX:
3266             loc++;              // URX_JMPX has an extra operand, ignored here,
3267                                 //   otherwise processed identically to URX_JMP.
3268             U_FALLTHROUGH;
3269         case URX_JMP:
3270             {
3271                 int32_t  jmpDest = URX_VAL(op);
3272                 if (jmpDest < loc) {
3273                     // Loop of some kind.  Can safely ignore, the worst that will happen
3274                     //  is that we understate the true minimum length
3275                     currentLen = forwardedLength.elementAti(loc+1);
3276                 } else {
3277                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3278                     U_ASSERT(jmpDest <= end+1);
3279                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3280                         forwardedLength.setElementAt(currentLen, jmpDest);
3281                     }
3282                 }
3283             }
3284             break;
3285 
3286         case URX_BACKTRACK:
3287             {
3288                 // Back-tracks are kind of like a branch, except that the min length was
3289                 //   propagated already, by the state save.
3290                 currentLen = forwardedLength.elementAti(loc+1);
3291             }
3292             break;
3293 
3294 
3295         case URX_STATE_SAVE:
3296             {
3297                 // State Save, for forward jumps, propagate the current minimum.
3298                 //             of the state save.
3299                 int32_t  jmpDest = URX_VAL(op);
3300                 if (jmpDest > loc) {
3301                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3302                         forwardedLength.setElementAt(currentLen, jmpDest);
3303                     }
3304                 }
3305             }
3306             break;
3307 
3308 
3309         case URX_STRING:
3310             {
3311                 loc++;
3312                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3313                 currentLen += URX_VAL(stringLenOp);
3314             }
3315             break;
3316 
3317 
3318         case URX_STRING_I:
3319             {
3320                 loc++;
3321                 // TODO: with full case folding, matching input text may be shorter than
3322                 //       the string we have here.  More smarts could put some bounds on it.
3323                 //       Assume a min length of one for now.  A min length of zero causes
3324                 //        optimization failures for a pattern like "string"+
3325                 // currentLen += URX_VAL(stringLenOp);
3326                 currentLen += 1;
3327             }
3328             break;
3329 
3330         case URX_CTR_INIT:
3331         case URX_CTR_INIT_NG:
3332             {
3333                 // Loop Init Ops.
3334                 //   If the min loop count == 0
3335                 //      move loc forwards to the end of the loop, skipping over the body.
3336                 //   If the min count is > 0,
3337                 //      continue normal processing of the body of the loop.
3338                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3339                         loopEndLoc   = URX_VAL(loopEndLoc);
3340                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3341                 if (minLoopCount == 0) {
3342                     loc = loopEndLoc;
3343                 } else {
3344                     loc+=3;  // Skips over operands of CTR_INIT
3345                 }
3346             }
3347             break;
3348 
3349 
3350         case URX_CTR_LOOP:
3351         case URX_CTR_LOOP_NG:
3352             // Loop ops.
3353             //  The jump is conditional, backwards only.
3354             break;
3355 
3356         case URX_LOOP_SR_I:
3357         case URX_LOOP_DOT_I:
3358         case URX_LOOP_C:
3359             // More loop ops.  These state-save to themselves.
3360             //   don't change the minimum match - could match nothing at all.
3361             break;
3362 
3363 
3364         case URX_LA_START:
3365         case URX_LB_START:
3366             {
3367                 // Look-around.  Scan forward until the matching look-ahead end,
3368                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3369                 //   it assumes that the look-ahead match might be zero-length.
3370                 //   TODO:  Positive lookahead could recursively do the block, then continue
3371                 //          with the longer of the block or the value coming in.  Ticket 6060
3372                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
3373                 for (;;) {
3374                     loc++;
3375                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3376                     if (URX_TYPE(op) == URX_LA_START) {
3377                         // The boilerplate for look-ahead includes two LA_END insturctions,
3378                         //    Depth will be decremented by each one when it is seen.
3379                         depth += 2;
3380                     }
3381                     if (URX_TYPE(op) == URX_LB_START) {
3382                         depth++;
3383                     }
3384                     if (URX_TYPE(op) == URX_LA_END) {
3385                         depth--;
3386                         if (depth == 0) {
3387                             break;
3388                         }
3389                     }
3390                     if (URX_TYPE(op)==URX_LBN_END) {
3391                         depth--;
3392                         if (depth == 0) {
3393                             break;
3394                         }
3395                     }
3396                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3397                         // Need this because neg lookahead blocks will FAIL to outside
3398                         //   of the block.
3399                         int32_t  jmpDest = URX_VAL(op);
3400                         if (jmpDest > loc) {
3401                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3402                                 forwardedLength.setElementAt(currentLen, jmpDest);
3403                             }
3404                         }
3405                     }
3406                     U_ASSERT(loc <= end);
3407                 }
3408             }
3409             break;
3410 
3411         case URX_LA_END:
3412         case URX_LB_CONT:
3413         case URX_LB_END:
3414         case URX_LBN_CONT:
3415         case URX_LBN_END:
3416             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3417             //   range being sized, which happens when measuring size of look-behind blocks.
3418             break;
3419 
3420         default:
3421             U_ASSERT(FALSE);
3422             }
3423 
3424         }
3425 
3426     // We have finished walking through the ops.  Check whether some forward jump
3427     //   propagated a shorter length to location end+1.
3428     if (forwardedLength.elementAti(end+1) < currentLen) {
3429         currentLen = forwardedLength.elementAti(end+1);
3430         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3431     }
3432 
3433     return currentLen;
3434 }
3435 
3436 // Increment with overflow check.
3437 // val and delta will both be positive.
3438 
safeIncrement(int32_t val,int32_t delta)3439 static int32_t safeIncrement(int32_t val, int32_t delta) {
3440     if (INT32_MAX - val > delta) {
3441         return val + delta;
3442     } else {
3443         return INT32_MAX;
3444     }
3445 }
3446 
3447 
3448 //------------------------------------------------------------------------------
3449 //
3450 //   maxMatchLength    Calculate the length of the longest string that could
3451 //                     match the specified pattern.
3452 //                     Length is in 16 bit code units, not code points.
3453 //
3454 //                     The calculated length may not be exact.  The returned
3455 //                     value may be longer than the actual maximum; it must
3456 //                     never be shorter.
3457 //
3458 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3459 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3460     if (U_FAILURE(*fStatus)) {
3461         return 0;
3462     }
3463     U_ASSERT(start <= end);
3464     U_ASSERT(end < fRXPat->fCompiledPat->size());
3465 
3466 
3467     int32_t    loc;
3468     int32_t    op;
3469     int32_t    opType;
3470     int32_t    currentLen = 0;
3471     UVector32  forwardedLength(end+1, *fStatus);
3472     forwardedLength.setSize(end+1);
3473 
3474     for (loc=start; loc<=end; loc++) {
3475         forwardedLength.setElementAt(0, loc);
3476     }
3477 
3478     for (loc = start; loc<=end; loc++) {
3479         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3480         opType = URX_TYPE(op);
3481 
3482         // The loop is advancing linearly through the pattern.
3483         // If the op we are now at was the destination of a branch in the pattern,
3484         // and that path has a longer maximum length than the current accumulated value,
3485         // replace the current accumulated value.
3486         if (forwardedLength.elementAti(loc) > currentLen) {
3487             currentLen = forwardedLength.elementAti(loc);
3488         }
3489 
3490         switch (opType) {
3491             // Ops that don't change the total length matched
3492         case URX_RESERVED_OP:
3493         case URX_END:
3494         case URX_STRING_LEN:
3495         case URX_NOP:
3496         case URX_START_CAPTURE:
3497         case URX_END_CAPTURE:
3498         case URX_BACKSLASH_B:
3499         case URX_BACKSLASH_BU:
3500         case URX_BACKSLASH_G:
3501         case URX_BACKSLASH_Z:
3502         case URX_CARET:
3503         case URX_DOLLAR:
3504         case URX_DOLLAR_M:
3505         case URX_DOLLAR_D:
3506         case URX_DOLLAR_MD:
3507         case URX_RELOC_OPRND:
3508         case URX_STO_INP_LOC:
3509         case URX_CARET_M:
3510         case URX_CARET_M_UNIX:
3511 
3512         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3513         case URX_LD_SP:
3514 
3515         case URX_LB_END:
3516         case URX_LB_CONT:
3517         case URX_LBN_CONT:
3518         case URX_LBN_END:
3519             break;
3520 
3521 
3522             // Ops that increase that cause an unbounded increase in the length
3523             //   of a matched string, or that increase it a hard to characterize way.
3524             //   Call the max length unbounded, and stop further checking.
3525         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3526         case URX_BACKREF_I:
3527         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3528             currentLen = INT32_MAX;
3529             break;
3530 
3531 
3532             // Ops that match a max of one character (possibly two 16 bit code units.)
3533             //
3534         case URX_STATIC_SETREF:
3535         case URX_STAT_SETREF_N:
3536         case URX_SETREF:
3537         case URX_BACKSLASH_D:
3538         case URX_BACKSLASH_H:
3539         case URX_BACKSLASH_R:
3540         case URX_BACKSLASH_V:
3541         case URX_ONECHAR_I:
3542         case URX_DOTANY_ALL:
3543         case URX_DOTANY:
3544         case URX_DOTANY_UNIX:
3545             currentLen = safeIncrement(currentLen, 2);
3546             break;
3547 
3548             // Single literal character.  Increase current max length by one or two,
3549             //       depending on whether the char is in the supplementary range.
3550         case URX_ONECHAR:
3551             currentLen = safeIncrement(currentLen, 1);
3552             if (URX_VAL(op) > 0x10000) {
3553                 currentLen = safeIncrement(currentLen, 1);
3554             }
3555             break;
3556 
3557             // Jumps.
3558             //
3559         case URX_JMP:
3560         case URX_JMPX:
3561         case URX_JMP_SAV:
3562         case URX_JMP_SAV_X:
3563             {
3564                 int32_t  jmpDest = URX_VAL(op);
3565                 if (jmpDest < loc) {
3566                     // Loop of some kind.  Max match length is unbounded.
3567                     currentLen = INT32_MAX;
3568                 } else {
3569                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3570                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3571                         forwardedLength.setElementAt(currentLen, jmpDest);
3572                     }
3573                     currentLen = 0;
3574                 }
3575             }
3576             break;
3577 
3578         case URX_BACKTRACK:
3579             // back-tracks are kind of like a branch, except that the max length was
3580             //   propagated already, by the state save.
3581             currentLen = forwardedLength.elementAti(loc+1);
3582             break;
3583 
3584 
3585         case URX_STATE_SAVE:
3586             {
3587                 // State Save, for forward jumps, propagate the current minimum.
3588                 //               of the state save.
3589                 //             For backwards jumps, they create a loop, maximum
3590                 //               match length is unbounded.
3591                 int32_t  jmpDest = URX_VAL(op);
3592                 if (jmpDest > loc) {
3593                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3594                         forwardedLength.setElementAt(currentLen, jmpDest);
3595                     }
3596                 } else {
3597                     currentLen = INT32_MAX;
3598                 }
3599             }
3600             break;
3601 
3602 
3603 
3604 
3605         case URX_STRING:
3606             {
3607                 loc++;
3608                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3609                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3610                 break;
3611             }
3612 
3613         case URX_STRING_I:
3614             // TODO:  This code assumes that any user string that matches will be no longer
3615             //        than our compiled string, with case insensitive matching.
3616             //        Our compiled string has been case-folded already.
3617             //
3618             //        Any matching user string will have no more code points than our
3619             //        compiled (folded) string.  Folding may add code points, but
3620             //        not remove them.
3621             //
3622             //        There is a potential problem if a supplemental code point
3623             //        case-folds to a BMP code point.  In this case our compiled string
3624             //        could be shorter (in code units) than a matching user string.
3625             //
3626             //        At this time (Unicode 6.1) there are no such characters, and this case
3627             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3628             //        any problematic characters are added to Unicode.
3629             //
3630             //        If this happens, we can make a set of the BMP chars that the
3631             //        troublesome supplementals fold to, scan our string, and bump the
3632             //        currentLen one extra for each that is found.
3633             //
3634             {
3635                 loc++;
3636                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3637                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3638             }
3639             break;
3640 
3641         case URX_CTR_INIT:
3642         case URX_CTR_INIT_NG:
3643             // For Loops, recursively call this function on the pattern for the loop body,
3644             //   then multiply the result by the maximum loop count.
3645             {
3646                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3647                 if (loopEndLoc == loc+4) {
3648                     // Loop has an empty body. No affect on max match length.
3649                     // Continue processing with code after the loop end.
3650                     loc = loopEndLoc;
3651                     break;
3652                 }
3653 
3654                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3655                 if (maxLoopCount == -1) {
3656                     // Unbounded Loop. No upper bound on match length.
3657                     currentLen = INT32_MAX;
3658                     break;
3659                 }
3660 
3661                 U_ASSERT(loopEndLoc >= loc+4);
3662                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3663                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3664                 if (updatedLen >= INT32_MAX) {
3665                     currentLen = INT32_MAX;
3666                     break;
3667                 }
3668                 currentLen = (int32_t)updatedLen;
3669                 loc = loopEndLoc;
3670                 break;
3671             }
3672 
3673         case URX_CTR_LOOP:
3674         case URX_CTR_LOOP_NG:
3675             // These opcodes will be skipped over by code for URX_CRT_INIT.
3676             // We shouldn't encounter them here.
3677             U_ASSERT(FALSE);
3678             break;
3679 
3680         case URX_LOOP_SR_I:
3681         case URX_LOOP_DOT_I:
3682         case URX_LOOP_C:
3683             // For anything to do with loops, make the match length unbounded.
3684             currentLen = INT32_MAX;
3685             break;
3686 
3687 
3688 
3689         case URX_LA_START:
3690         case URX_LA_END:
3691             // Look-ahead.  Just ignore, treat the look-ahead block as if
3692             // it were normal pattern.  Gives a too-long match length,
3693             //  but good enough for now.
3694             break;
3695 
3696             // End of look-ahead ops should always be consumed by the processing at
3697             //  the URX_LA_START op.
3698             // U_ASSERT(FALSE);
3699             // break;
3700 
3701         case URX_LB_START:
3702             {
3703                 // Look-behind.  Scan forward until the matching look-around end,
3704                 //   without processing the look-behind block.
3705                 int32_t  depth = 0;
3706                 for (;;) {
3707                     loc++;
3708                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3709                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3710                         depth++;
3711                     }
3712                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3713                         if (depth == 0) {
3714                             break;
3715                         }
3716                         depth--;
3717                     }
3718                     U_ASSERT(loc < end);
3719                 }
3720             }
3721             break;
3722 
3723         default:
3724             U_ASSERT(FALSE);
3725         }
3726 
3727 
3728         if (currentLen == INT32_MAX) {
3729             //  The maximum length is unbounded.
3730             //  Stop further processing of the pattern.
3731             break;
3732         }
3733 
3734     }
3735     return currentLen;
3736 
3737 }
3738 
3739 
3740 //------------------------------------------------------------------------------
3741 //
3742 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3743 //                Extra NOPs are inserted for some constructs during the initial
3744 //                code generation to provide locations that may be patched later.
3745 //                Many end up unneeded, and are removed by this function.
3746 //
3747 //                In order to minimize the number of passes through the pattern,
3748 //                back-reference fixup is also performed here (adjusting
3749 //                back-reference operands to point to the correct frame offsets).
3750 //
3751 //------------------------------------------------------------------------------
stripNOPs()3752 void RegexCompile::stripNOPs() {
3753 
3754     if (U_FAILURE(*fStatus)) {
3755         return;
3756     }
3757 
3758     int32_t    end = fRXPat->fCompiledPat->size();
3759     UVector32  deltas(end, *fStatus);
3760 
3761     // Make a first pass over the code, computing the amount that things
3762     //   will be offset at each location in the original code.
3763     int32_t   loc;
3764     int32_t   d = 0;
3765     for (loc=0; loc<end; loc++) {
3766         deltas.addElement(d, *fStatus);
3767         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3768         if (URX_TYPE(op) == URX_NOP) {
3769             d++;
3770         }
3771     }
3772 
3773     UnicodeString caseStringBuffer;
3774 
3775     // Make a second pass over the code, removing the NOPs by moving following
3776     //  code up, and patching operands that refer to code locations that
3777     //  are being moved.  The array of offsets from the first step is used
3778     //  to compute the new operand values.
3779     int32_t src;
3780     int32_t dst = 0;
3781     for (src=0; src<end; src++) {
3782         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3783         int32_t opType = URX_TYPE(op);
3784         switch (opType) {
3785         case URX_NOP:
3786             break;
3787 
3788         case URX_STATE_SAVE:
3789         case URX_JMP:
3790         case URX_CTR_LOOP:
3791         case URX_CTR_LOOP_NG:
3792         case URX_RELOC_OPRND:
3793         case URX_JMPX:
3794         case URX_JMP_SAV:
3795         case URX_JMP_SAV_X:
3796             // These are instructions with operands that refer to code locations.
3797             {
3798                 int32_t  operandAddress = URX_VAL(op);
3799                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3800                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3801                 op = buildOp(opType, fixedOperandAddress);
3802                 fRXPat->fCompiledPat->setElementAt(op, dst);
3803                 dst++;
3804                 break;
3805             }
3806 
3807         case URX_BACKREF:
3808         case URX_BACKREF_I:
3809             {
3810                 int32_t where = URX_VAL(op);
3811                 if (where > fRXPat->fGroupMap->size()) {
3812                     error(U_REGEX_INVALID_BACK_REF);
3813                     break;
3814                 }
3815                 where = fRXPat->fGroupMap->elementAti(where-1);
3816                 op    = buildOp(opType, where);
3817                 fRXPat->fCompiledPat->setElementAt(op, dst);
3818                 dst++;
3819 
3820                 fRXPat->fNeedsAltInput = TRUE;
3821                 break;
3822             }
3823         case URX_RESERVED_OP:
3824         case URX_RESERVED_OP_N:
3825         case URX_BACKTRACK:
3826         case URX_END:
3827         case URX_ONECHAR:
3828         case URX_STRING:
3829         case URX_STRING_LEN:
3830         case URX_START_CAPTURE:
3831         case URX_END_CAPTURE:
3832         case URX_STATIC_SETREF:
3833         case URX_STAT_SETREF_N:
3834         case URX_SETREF:
3835         case URX_DOTANY:
3836         case URX_FAIL:
3837         case URX_BACKSLASH_B:
3838         case URX_BACKSLASH_BU:
3839         case URX_BACKSLASH_G:
3840         case URX_BACKSLASH_X:
3841         case URX_BACKSLASH_Z:
3842         case URX_DOTANY_ALL:
3843         case URX_BACKSLASH_D:
3844         case URX_CARET:
3845         case URX_DOLLAR:
3846         case URX_CTR_INIT:
3847         case URX_CTR_INIT_NG:
3848         case URX_DOTANY_UNIX:
3849         case URX_STO_SP:
3850         case URX_LD_SP:
3851         case URX_STO_INP_LOC:
3852         case URX_LA_START:
3853         case URX_LA_END:
3854         case URX_ONECHAR_I:
3855         case URX_STRING_I:
3856         case URX_DOLLAR_M:
3857         case URX_CARET_M:
3858         case URX_CARET_M_UNIX:
3859         case URX_LB_START:
3860         case URX_LB_CONT:
3861         case URX_LB_END:
3862         case URX_LBN_CONT:
3863         case URX_LBN_END:
3864         case URX_LOOP_SR_I:
3865         case URX_LOOP_DOT_I:
3866         case URX_LOOP_C:
3867         case URX_DOLLAR_D:
3868         case URX_DOLLAR_MD:
3869         case URX_BACKSLASH_H:
3870         case URX_BACKSLASH_R:
3871         case URX_BACKSLASH_V:
3872             // These instructions are unaltered by the relocation.
3873             fRXPat->fCompiledPat->setElementAt(op, dst);
3874             dst++;
3875             break;
3876 
3877         default:
3878             // Some op is unaccounted for.
3879             U_ASSERT(FALSE);
3880             error(U_REGEX_INTERNAL_ERROR);
3881         }
3882     }
3883 
3884     fRXPat->fCompiledPat->setSize(dst);
3885 }
3886 
3887 
3888 
3889 
3890 //------------------------------------------------------------------------------
3891 //
3892 //  Error         Report a rule parse error.
3893 //                Only report it if no previous error has been recorded.
3894 //
3895 //------------------------------------------------------------------------------
error(UErrorCode e)3896 void RegexCompile::error(UErrorCode e) {
3897     if (U_SUCCESS(*fStatus)) {
3898         *fStatus = e;
3899         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3900         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3901         // int64_t. If the values of the latter are out of range for the former,
3902         // set them to the appropriate "field not supported" values.
3903         if (fLineNum > 0x7FFFFFFF) {
3904             fParseErr->line   = 0;
3905             fParseErr->offset = -1;
3906         } else if (fCharNum > 0x7FFFFFFF) {
3907             fParseErr->line   = (int32_t)fLineNum;
3908             fParseErr->offset = -1;
3909         } else {
3910             fParseErr->line   = (int32_t)fLineNum;
3911             fParseErr->offset = (int32_t)fCharNum;
3912         }
3913 
3914         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3915 
3916         // Fill in the context.
3917         //   Note: extractBetween() pins supplied indicies to the string bounds.
3918         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3919         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3920         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3921         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3922     }
3923 }
3924 
3925 
3926 //
3927 //  Assorted Unicode character constants.
3928 //     Numeric because there is no portable way to enter them as literals.
3929 //     (Think EBCDIC).
3930 //
3931 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3932 static const UChar      chLF        = 0x0a;      // Line Feed
3933 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3934 static const UChar      chDigit0    = 0x30;      // '0'
3935 static const UChar      chDigit7    = 0x37;      // '9'
3936 static const UChar      chColon     = 0x3A;      // ':'
3937 static const UChar      chE         = 0x45;      // 'E'
3938 static const UChar      chQ         = 0x51;      // 'Q'
3939 //static const UChar      chN         = 0x4E;      // 'N'
3940 static const UChar      chP         = 0x50;      // 'P'
3941 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3942 //static const UChar      chLBracket  = 0x5b;      // '['
3943 static const UChar      chRBracket  = 0x5d;      // ']'
3944 static const UChar      chUp        = 0x5e;      // '^'
3945 static const UChar      chLowerP    = 0x70;
3946 static const UChar      chLBrace    = 0x7b;      // '{'
3947 static const UChar      chRBrace    = 0x7d;      // '}'
3948 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3949 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3950 
3951 
3952 //------------------------------------------------------------------------------
3953 //
3954 //  nextCharLL    Low Level Next Char from the regex pattern.
3955 //                Get a char from the string, keep track of input position
3956 //                     for error reporting.
3957 //
3958 //------------------------------------------------------------------------------
nextCharLL()3959 UChar32  RegexCompile::nextCharLL() {
3960     UChar32       ch;
3961 
3962     if (fPeekChar != -1) {
3963         ch = fPeekChar;
3964         fPeekChar = -1;
3965         return ch;
3966     }
3967 
3968     // assume we're already in the right place
3969     ch = UTEXT_NEXT32(fRXPat->fPattern);
3970     if (ch == U_SENTINEL) {
3971         return ch;
3972     }
3973 
3974     if (ch == chCR ||
3975         ch == chNEL ||
3976         ch == chLS   ||
3977         (ch == chLF && fLastChar != chCR)) {
3978         // Character is starting a new line.  Bump up the line number, and
3979         //  reset the column to 0.
3980         fLineNum++;
3981         fCharNum=0;
3982     }
3983     else {
3984         // Character is not starting a new line.  Except in the case of a
3985         //   LF following a CR, increment the column position.
3986         if (ch != chLF) {
3987             fCharNum++;
3988         }
3989     }
3990     fLastChar = ch;
3991     return ch;
3992 }
3993 
3994 //------------------------------------------------------------------------------
3995 //
3996 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
3997 //                 character without actually getting it.
3998 //
3999 //------------------------------------------------------------------------------
peekCharLL()4000 UChar32  RegexCompile::peekCharLL() {
4001     if (fPeekChar == -1) {
4002         fPeekChar = nextCharLL();
4003     }
4004     return fPeekChar;
4005 }
4006 
4007 
4008 //------------------------------------------------------------------------------
4009 //
4010 //   nextChar     for pattern scanning.  At this level, we handle stripping
4011 //                out comments and processing some backslash character escapes.
4012 //                The rest of the pattern grammar is handled at the next level up.
4013 //
4014 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4015 void RegexCompile::nextChar(RegexPatternChar &c) {
4016 
4017     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4018     c.fChar    = nextCharLL();
4019     c.fQuoted  = FALSE;
4020 
4021     if (fQuoteMode) {
4022         c.fQuoted = TRUE;
4023         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4024             c.fChar == (UChar32)-1) {
4025             fQuoteMode = FALSE;  //  Exit quote mode,
4026             nextCharLL();        // discard the E
4027             nextChar(c);         // recurse to get the real next char
4028         }
4029     }
4030     else if (fInBackslashQuote) {
4031         // The current character immediately follows a '\'
4032         // Don't check for any further escapes, just return it as-is.
4033         // Don't set c.fQuoted, because that would prevent the state machine from
4034         //    dispatching on the character.
4035         fInBackslashQuote = FALSE;
4036     }
4037     else
4038     {
4039         // We are not in a \Q quoted region \E of the source.
4040         //
4041         if (fModeFlags & UREGEX_COMMENTS) {
4042             //
4043             // We are in free-spacing and comments mode.
4044             //  Scan through any white space and comments, until we
4045             //  reach a significant character or the end of inut.
4046             for (;;) {
4047                 if (c.fChar == (UChar32)-1) {
4048                     break;     // End of Input
4049                 }
4050                 if  (c.fChar == chPound && fEOLComments == TRUE) {
4051                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4052                     for (;;) {
4053                         c.fChar = nextCharLL();
4054                         if (c.fChar == (UChar32)-1 ||  // EOF
4055                             c.fChar == chCR        ||
4056                             c.fChar == chLF        ||
4057                             c.fChar == chNEL       ||
4058                             c.fChar == chLS)       {
4059                             break;
4060                         }
4061                     }
4062                 }
4063                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4064                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4065                     break;
4066                 }
4067                 c.fChar = nextCharLL();
4068             }
4069         }
4070 
4071         //
4072         //  check for backslash escaped characters.
4073         //
4074         if (c.fChar == chBackSlash) {
4075             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4076             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4077                 //
4078                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4079                 //   Includes \uxxxx, \n, \r, many others.
4080                 //   Return the single equivalent character.
4081                 //
4082                 nextCharLL();                 // get & discard the peeked char.
4083                 c.fQuoted = TRUE;
4084 
4085                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4086                     int32_t endIndex = (int32_t)pos;
4087                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4088 
4089                     if (endIndex == pos) {
4090                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4091                     }
4092                     fCharNum += endIndex - pos;
4093                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4094                 } else {
4095                     int32_t offset = 0;
4096                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4097 
4098                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4099                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4100 
4101                     if (offset == 0) {
4102                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4103                     } else if (context.lastOffset == offset) {
4104                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4105                     } else if (context.lastOffset != offset-1) {
4106                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4107                     }
4108                     fCharNum += offset;
4109                 }
4110             }
4111             else if (peekCharLL() == chDigit0) {
4112                 //  Octal Escape, using Java Regexp Conventions
4113                 //    which are \0 followed by 1-3 octal digits.
4114                 //    Different from ICU Unescape handling of Octal, which does not
4115                 //    require the leading 0.
4116                 //  Java also has the convention of only consuming 2 octal digits if
4117                 //    the three digit number would be > 0xff
4118                 //
4119                 c.fChar = 0;
4120                 nextCharLL();    // Consume the initial 0.
4121                 int index;
4122                 for (index=0; index<3; index++) {
4123                     int32_t ch = peekCharLL();
4124                     if (ch<chDigit0 || ch>chDigit7) {
4125                         if (index==0) {
4126                            // \0 is not followed by any octal digits.
4127                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4128                         }
4129                         break;
4130                     }
4131                     c.fChar <<= 3;
4132                     c.fChar += ch&7;
4133                     if (c.fChar <= 255) {
4134                         nextCharLL();
4135                     } else {
4136                         // The last digit made the number too big.  Forget we saw it.
4137                         c.fChar >>= 3;
4138                     }
4139                 }
4140                 c.fQuoted = TRUE;
4141             }
4142             else if (peekCharLL() == chQ) {
4143                 //  "\Q"  enter quote mode, which will continue until "\E"
4144                 fQuoteMode = TRUE;
4145                 nextCharLL();       // discard the 'Q'.
4146                 nextChar(c);        // recurse to get the real next char.
4147             }
4148             else
4149             {
4150                 // We are in a '\' escape that will be handled by the state table scanner.
4151                 // Just return the backslash, but remember that the following char is to
4152                 //  be taken literally.
4153                 fInBackslashQuote = TRUE;
4154             }
4155         }
4156     }
4157 
4158     // re-enable # to end-of-line comments, in case they were disabled.
4159     // They are disabled by the parser upon seeing '(?', but this lasts for
4160     //  the fetching of the next character only.
4161     fEOLComments = TRUE;
4162 
4163     // putc(c.fChar, stdout);
4164 }
4165 
4166 
4167 
4168 //------------------------------------------------------------------------------
4169 //
4170 //  scanNamedChar
4171 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4172 //
4173 //             The scan position will be at the 'N'.  On return
4174 //             the scan position should be just after the '}'
4175 //
4176 //             Return the UChar32
4177 //
4178 //------------------------------------------------------------------------------
scanNamedChar()4179 UChar32  RegexCompile::scanNamedChar() {
4180     if (U_FAILURE(*fStatus)) {
4181         return 0;
4182     }
4183 
4184     nextChar(fC);
4185     if (fC.fChar != chLBrace) {
4186         error(U_REGEX_PROPERTY_SYNTAX);
4187         return 0;
4188     }
4189 
4190     UnicodeString  charName;
4191     for (;;) {
4192         nextChar(fC);
4193         if (fC.fChar == chRBrace) {
4194             break;
4195         }
4196         if (fC.fChar == -1) {
4197             error(U_REGEX_PROPERTY_SYNTAX);
4198             return 0;
4199         }
4200         charName.append(fC.fChar);
4201     }
4202 
4203     char name[100];
4204     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4205          (uint32_t)charName.length()>=sizeof(name)) {
4206         // All Unicode character names have only invariant characters.
4207         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4208         //   which requires this error check
4209         error(U_REGEX_PROPERTY_SYNTAX);
4210         return 0;
4211     }
4212     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4213 
4214     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4215     if (U_FAILURE(*fStatus)) {
4216         error(U_REGEX_PROPERTY_SYNTAX);
4217     }
4218 
4219     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4220     return theChar;
4221 }
4222 
4223 //------------------------------------------------------------------------------
4224 //
4225 //  scanProp   Construct a UnicodeSet from the text at the current scan
4226 //             position, which will be of the form \p{whaterver}
4227 //
4228 //             The scan position will be at the 'p' or 'P'.  On return
4229 //             the scan position should be just after the '}'
4230 //
4231 //             Return a UnicodeSet, constructed from the \P pattern,
4232 //             or NULL if the pattern is invalid.
4233 //
4234 //------------------------------------------------------------------------------
scanProp()4235 UnicodeSet *RegexCompile::scanProp() {
4236     UnicodeSet    *uset = NULL;
4237 
4238     if (U_FAILURE(*fStatus)) {
4239         return NULL;
4240     }
4241     (void)chLowerP;   // Suppress compiler unused variable warning.
4242     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4243     UBool negated = (fC.fChar == chP);
4244 
4245     UnicodeString propertyName;
4246     nextChar(fC);
4247     if (fC.fChar != chLBrace) {
4248         error(U_REGEX_PROPERTY_SYNTAX);
4249         return NULL;
4250     }
4251     for (;;) {
4252         nextChar(fC);
4253         if (fC.fChar == chRBrace) {
4254             break;
4255         }
4256         if (fC.fChar == -1) {
4257             // Hit the end of the input string without finding the closing '}'
4258             error(U_REGEX_PROPERTY_SYNTAX);
4259             return NULL;
4260         }
4261         propertyName.append(fC.fChar);
4262     }
4263     uset = createSetForProperty(propertyName, negated);
4264     nextChar(fC);    // Move input scan to position following the closing '}'
4265     return uset;
4266 }
4267 
4268 //------------------------------------------------------------------------------
4269 //
4270 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4271 //             position, which is expected be of the form [:property expression:]
4272 //
4273 //             The scan position will be at the opening ':'.  On return
4274 //             the scan position must be on the closing ']'
4275 //
4276 //             Return a UnicodeSet constructed from the pattern,
4277 //             or NULL if this is not a valid POSIX-style set expression.
4278 //             If not a property expression, restore the initial scan position
4279 //                (to the opening ':')
4280 //
4281 //               Note:  the opening '[:' is not sufficient to guarantee that
4282 //                      this is a [:property:] expression.
4283 //                      [:'+=,] is a perfectly good ordinary set expression that
4284 //                              happens to include ':' as one of its characters.
4285 //
4286 //------------------------------------------------------------------------------
scanPosixProp()4287 UnicodeSet *RegexCompile::scanPosixProp() {
4288     UnicodeSet    *uset = NULL;
4289 
4290     if (U_FAILURE(*fStatus)) {
4291         return NULL;
4292     }
4293 
4294     U_ASSERT(fC.fChar == chColon);
4295 
4296     // Save the scanner state.
4297     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4298     int64_t     savedScanIndex        = fScanIndex;
4299     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4300     UBool       savedQuoteMode        = fQuoteMode;
4301     UBool       savedInBackslashQuote = fInBackslashQuote;
4302     UBool       savedEOLComments      = fEOLComments;
4303     int64_t     savedLineNum          = fLineNum;
4304     int64_t     savedCharNum          = fCharNum;
4305     UChar32     savedLastChar         = fLastChar;
4306     UChar32     savedPeekChar         = fPeekChar;
4307     RegexPatternChar savedfC          = fC;
4308 
4309     // Scan for a closing ].   A little tricky because there are some perverse
4310     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4311     //   ending on the second closing ].
4312 
4313     UnicodeString propName;
4314     UBool         negated  = FALSE;
4315 
4316     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4317     nextChar(fC);
4318     if (fC.fChar == chUp) {
4319        negated = TRUE;
4320        nextChar(fC);
4321     }
4322 
4323     // Scan for the closing ":]", collecting the property name along the way.
4324     UBool  sawPropSetTerminator = FALSE;
4325     for (;;) {
4326         propName.append(fC.fChar);
4327         nextChar(fC);
4328         if (fC.fQuoted || fC.fChar == -1) {
4329             // Escaped characters or end of input - either says this isn't a [:Property:]
4330             break;
4331         }
4332         if (fC.fChar == chColon) {
4333             nextChar(fC);
4334             if (fC.fChar == chRBracket) {
4335                 sawPropSetTerminator = TRUE;
4336             }
4337             break;
4338         }
4339     }
4340 
4341     if (sawPropSetTerminator) {
4342         uset = createSetForProperty(propName, negated);
4343     }
4344     else
4345     {
4346         // No closing ":]".
4347         //  Restore the original scan position.
4348         //  The main scanner will retry the input as a normal set expression,
4349         //    not a [:Property:] expression.
4350         fScanIndex        = savedScanIndex;
4351         fQuoteMode        = savedQuoteMode;
4352         fInBackslashQuote = savedInBackslashQuote;
4353         fEOLComments      = savedEOLComments;
4354         fLineNum          = savedLineNum;
4355         fCharNum          = savedCharNum;
4356         fLastChar         = savedLastChar;
4357         fPeekChar         = savedPeekChar;
4358         fC                = savedfC;
4359         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4360     }
4361     return uset;
4362 }
4363 
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4364 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4365     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4366     addCategory(set, U_GC_CF_MASK, ec);
4367 }
4368 
4369 //
4370 //  Create a Unicode Set from a Unicode Property expression.
4371 //     This is common code underlying both \p{...} ane [:...:] expressions.
4372 //     Includes trying the Java "properties" that aren't supported as
4373 //     normal ICU UnicodeSet properties
4374 //
4375 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4376 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
createSetForProperty(const UnicodeString & propName,UBool negated)4377 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4378     UnicodeString   setExpr;
4379     UnicodeSet      *set;
4380     uint32_t        usetFlags = 0;
4381 
4382     if (U_FAILURE(*fStatus)) {
4383         return NULL;
4384     }
4385 
4386     //
4387     //  First try the property as we received it
4388     //
4389     if (negated) {
4390         setExpr.append(negSetPrefix, -1);
4391     } else {
4392         setExpr.append(posSetPrefix, -1);
4393     }
4394     setExpr.append(propName);
4395     setExpr.append(chRBrace);
4396     setExpr.append(chRBracket);
4397     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4398         usetFlags |= USET_CASE_INSENSITIVE;
4399     }
4400     set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4401     if (U_SUCCESS(*fStatus)) {
4402        return set;
4403     }
4404     delete set;
4405     set = NULL;
4406 
4407     //
4408     //  The property as it was didn't work.
4409 
4410     //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
4411     //     or standard Java, but many other regular expression packages do recognize it.
4412 
4413     if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4414         *fStatus = U_ZERO_ERROR;
4415         set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4416         if (set == NULL) {
4417             *fStatus = U_MEMORY_ALLOCATION_ERROR;
4418             return set;
4419         }
4420         if (negated) {
4421             set->complement();
4422         }
4423         return set;
4424     }
4425 
4426 
4427     //    Do Java fixes -
4428     //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4429     //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4430     //
4431     //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4432     //                        is accepted by Java.  The property part of the name is compared
4433     //                        case-insenstively.  The spaces must be exactly as shown, either
4434     //                        all there, or all omitted, with exactly one at each position
4435     //                        if they are present.  From checking against JDK 1.6
4436     //
4437     //       This code should be removed when ICU properties support the Java  compatibility names
4438     //          (ICU 4.0?)
4439     //
4440     UnicodeString mPropName = propName;
4441     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4442         mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4443     }
4444     if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4445         mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4446         mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4447     }
4448     else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4449         mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4450     }
4451 
4452     //    See if the property looks like a Java "InBlockName", which
4453     //    we will recast as "Block=BlockName"
4454     //
4455     static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
4456     static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
4457     if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4458         setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
4459         setExpr.append(BLOCK, -1);
4460         setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
4461         setExpr.append(chRBrace);
4462         setExpr.append(chRBracket);
4463         *fStatus = U_ZERO_ERROR;
4464         set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4465         if (U_SUCCESS(*fStatus)) {
4466             return set;
4467         }
4468         delete set;
4469         set = NULL;
4470     }
4471 
4472     if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4473         propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4474     {
4475         UErrorCode localStatus = U_ZERO_ERROR;
4476         //setExpr.remove();
4477         set = new UnicodeSet();
4478         //
4479         //  Try the various Java specific properties.
4480         //   These all begin with "java"
4481         //
4482         if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4483             addCategory(set, U_GC_CN_MASK, localStatus);
4484             set->complement();
4485         }
4486         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4487             addCategory(set, U_GC_ND_MASK, localStatus);
4488         }
4489         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4490             addIdentifierIgnorable(set, localStatus);
4491         }
4492         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4493             set->add(0, 0x1F).add(0x7F, 0x9F);
4494         }
4495         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4496             addCategory(set, U_GC_L_MASK, localStatus);
4497             addCategory(set, U_GC_SC_MASK, localStatus);
4498             addCategory(set, U_GC_PC_MASK, localStatus);
4499             addCategory(set, U_GC_ND_MASK, localStatus);
4500             addCategory(set, U_GC_NL_MASK, localStatus);
4501             addCategory(set, U_GC_MC_MASK, localStatus);
4502             addCategory(set, U_GC_MN_MASK, localStatus);
4503             addIdentifierIgnorable(set, localStatus);
4504         }
4505         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4506             addCategory(set, U_GC_L_MASK, localStatus);
4507             addCategory(set, U_GC_NL_MASK, localStatus);
4508             addCategory(set, U_GC_SC_MASK, localStatus);
4509             addCategory(set, U_GC_PC_MASK, localStatus);
4510         }
4511         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4512             addCategory(set, U_GC_L_MASK, localStatus);
4513         }
4514         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4515             addCategory(set, U_GC_L_MASK, localStatus);
4516             addCategory(set, U_GC_ND_MASK, localStatus);
4517         }
4518         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4519             addCategory(set, U_GC_LL_MASK, localStatus);
4520         }
4521         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4522             set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4523         }
4524         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4525             addCategory(set, U_GC_Z_MASK, localStatus);
4526         }
4527         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4528             set->add(0x10000, UnicodeSet::MAX_VALUE);
4529         }
4530         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4531             addCategory(set, U_GC_LT_MASK, localStatus);
4532         }
4533         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4534             addCategory(set, U_GC_L_MASK, localStatus);
4535             addCategory(set, U_GC_NL_MASK, localStatus);
4536         }
4537         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4538             addCategory(set, U_GC_L_MASK, localStatus);
4539             addCategory(set, U_GC_PC_MASK, localStatus);
4540             addCategory(set, U_GC_ND_MASK, localStatus);
4541             addCategory(set, U_GC_NL_MASK, localStatus);
4542             addCategory(set, U_GC_MC_MASK, localStatus);
4543             addCategory(set, U_GC_MN_MASK, localStatus);
4544             addIdentifierIgnorable(set, localStatus);
4545         }
4546         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4547             addCategory(set, U_GC_LU_MASK, localStatus);
4548         }
4549         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4550             set->add(0, UnicodeSet::MAX_VALUE);
4551         }
4552         else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4553             addCategory(set, U_GC_Z_MASK, localStatus);
4554             set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4555             set->add(9, 0x0d).add(0x1c, 0x1f);
4556         }
4557         else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4558             set->add(0, UnicodeSet::MAX_VALUE);
4559         }
4560 
4561         if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4562             *fStatus = U_ZERO_ERROR;
4563             if (usetFlags & USET_CASE_INSENSITIVE) {
4564                 set->closeOver(USET_CASE_INSENSITIVE);
4565             }
4566             if (negated) {
4567                 set->complement();
4568             }
4569             return set;
4570         }
4571         delete set;
4572         set = NULL;
4573     }
4574     error(*fStatus);
4575     return NULL;
4576 }
4577 
4578 
4579 
4580 //
4581 //  SetEval   Part of the evaluation of [set expressions].
4582 //            Perform any pending (stacked) operations with precedence
4583 //            equal or greater to that of the next operator encountered
4584 //            in the expression.
4585 //
setEval(int32_t nextOp)4586 void RegexCompile::setEval(int32_t nextOp) {
4587     UnicodeSet *rightOperand = NULL;
4588     UnicodeSet *leftOperand  = NULL;
4589     for (;;) {
4590         U_ASSERT(fSetOpStack.empty()==FALSE);
4591         int32_t pendingSetOperation = fSetOpStack.peeki();
4592         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4593             break;
4594         }
4595         fSetOpStack.popi();
4596         U_ASSERT(fSetStack.empty() == FALSE);
4597         rightOperand = (UnicodeSet *)fSetStack.peek();
4598         switch (pendingSetOperation) {
4599             case setNegation:
4600                 rightOperand->complement();
4601                 break;
4602             case setCaseClose:
4603                 // TODO: need a simple close function.  Ticket 6065
4604                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4605                 rightOperand->removeAllStrings();
4606                 break;
4607             case setDifference1:
4608             case setDifference2:
4609                 fSetStack.pop();
4610                 leftOperand = (UnicodeSet *)fSetStack.peek();
4611                 leftOperand->removeAll(*rightOperand);
4612                 delete rightOperand;
4613                 break;
4614             case setIntersection1:
4615             case setIntersection2:
4616                 fSetStack.pop();
4617                 leftOperand = (UnicodeSet *)fSetStack.peek();
4618                 leftOperand->retainAll(*rightOperand);
4619                 delete rightOperand;
4620                 break;
4621             case setUnion:
4622                 fSetStack.pop();
4623                 leftOperand = (UnicodeSet *)fSetStack.peek();
4624                 leftOperand->addAll(*rightOperand);
4625                 delete rightOperand;
4626                 break;
4627             default:
4628                 U_ASSERT(FALSE);
4629                 break;
4630             }
4631         }
4632     }
4633 
setPushOp(int32_t op)4634 void RegexCompile::setPushOp(int32_t op) {
4635     setEval(op);
4636     fSetOpStack.push(op, *fStatus);
4637     fSetStack.push(new UnicodeSet(), *fStatus);
4638 }
4639 
4640 U_NAMESPACE_END
4641 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4642 
4643