• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  regexcmp.cpp
5 //
6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the ICU regular expression compiler, which is responsible
10 //  for processing a regular expression pattern into the compiled form that
11 //  is used by the match finding engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstr.h"
32 #include "cstring.h"
33 #include "uvectr32.h"
34 #include "uvectr64.h"
35 #include "uassert.h"
36 #include "uinvchar.h"
37 
38 #include "regeximp.h"
39 #include "regexcst.h"   // Contains state table for the regex pattern parser.
40                         //   generated by a Perl script.
41 #include "regexcmp.h"
42 #include "regexst.h"
43 #include "regextxt.h"
44 
45 
46 
47 U_NAMESPACE_BEGIN
48 
49 
50 //------------------------------------------------------------------------------
51 //
52 //  Constructor.
53 //
54 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56    fParenStack(status), fSetStack(status), fSetOpStack(status)
57 {
58     // Lazy init of all shared global sets (needed for init()'s empty text)
59     RegexStaticSets::initGlobals(&status);
60 
61     fStatus           = &status;
62 
63     fRXPat            = rxp;
64     fScanIndex        = 0;
65     fLastChar         = -1;
66     fPeekChar         = -1;
67     fLineNum          = 1;
68     fCharNum          = 0;
69     fQuoteMode        = FALSE;
70     fInBackslashQuote = FALSE;
71     fModeFlags        = fRXPat->fFlags | 0x80000000;
72     fEOLComments      = TRUE;
73 
74     fMatchOpenParen   = -1;
75     fMatchCloseParen  = -1;
76     fCaptureName      = NULL;
77     fLastSetLiteral   = U_SENTINEL;
78 
79     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80         status = rxp->fDeferredStatus;
81     }
82 }
83 
84 static const UChar      chAmp       = 0x26;      // '&'
85 static const UChar      chDash      = 0x2d;      // '-'
86 
87 
88 //------------------------------------------------------------------------------
89 //
90 //  Destructor
91 //
92 //------------------------------------------------------------------------------
~RegexCompile()93 RegexCompile::~RegexCompile() {
94     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
95                                  //   compilation stops with a syntax error.
96 }
97 
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100 }
101 
102 //------------------------------------------------------------------------------
103 //
104 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105 //                           The state tables are hand-written in the file regexcst.txt,
106 //                           and converted to the form used here by a perl
107 //                           script regexcst.pl
108 //
109 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)110 void    RegexCompile::compile(
111                          const UnicodeString &pat,   // Source pat to be compiled.
112                          UParseError &pp,            // Error position info
113                          UErrorCode &e)              // Error Code
114 {
115     fRXPat->fPatternString = new UnicodeString(pat);
116     UText patternText = UTEXT_INITIALIZER;
117     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118 
119     if (U_SUCCESS(e)) {
120         compile(&patternText, pp, e);
121         utext_close(&patternText);
122     }
123 }
124 
125 //
126 //   compile, UText mode
127 //     All the work is actually done here.
128 //
compile(UText * pat,UParseError & pp,UErrorCode & e)129 void    RegexCompile::compile(
130                          UText *pat,                 // Source pat to be compiled.
131                          UParseError &pp,            // Error position info
132                          UErrorCode &e)              // Error Code
133 {
134     fStatus             = &e;
135     fParseErr           = &pp;
136     fStackPtr           = 0;
137     fStack[fStackPtr]   = 0;
138 
139     if (U_FAILURE(*fStatus)) {
140         return;
141     }
142 
143     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
145 
146     // Prepare the RegexPattern object to receive the compiled pattern.
147     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148     if (U_FAILURE(*fStatus)) {
149         return;
150     }
151 
152     // Initialize the pattern scanning state machine
153     fPatternLength = utext_nativeLength(pat);
154     uint16_t                state = 1;
155     const RegexTableEl      *tableEl;
156 
157     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158     if (fModeFlags & UREGEX_LITERAL) {
159         fQuoteMode = TRUE;
160     }
161 
162     nextChar(fC);                        // Fetch the first char from the pattern string.
163 
164     //
165     // Main loop for the regex pattern parsing state machine.
166     //   Runs once per state transition.
167     //   Each time through optionally performs, depending on the state table,
168     //      - an advance to the the next pattern char
169     //      - an action to be performed.
170     //      - pushing or popping a state to/from the local state return stack.
171     //   file regexcst.txt is the source for the state table.  The logic behind
172     //     recongizing the pattern syntax is there, not here.
173     //
174     for (;;) {
175         //  Bail out if anything has gone wrong.
176         //  Regex pattern parsing stops on the first error encountered.
177         if (U_FAILURE(*fStatus)) {
178             break;
179         }
180 
181         U_ASSERT(state != 0);
182 
183         // Find the state table element that matches the input char from the pattern, or the
184         //    class of the input character.  Start with the first table row for this
185         //    state, then linearly scan forward until we find a row that matches the
186         //    character.  The last row for each state always matches all characters, so
187         //    the search will stop there, if not before.
188         //
189         tableEl = &gRuleParseStateTable[state];
190         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
191             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192 
193         for (;;) {    // loop through table rows belonging to this state, looking for one
194                       //   that matches the current input char.
195             REGEX_SCAN_DEBUG_PRINTF(("."));
196             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
197                 // Table row specified an individual character, not a set, and
198                 //   the input character is not quoted, and
199                 //   the input character matched it.
200                 break;
201             }
202             if (tableEl->fCharClass == 255) {
203                 // Table row specified default, match anything character class.
204                 break;
205             }
206             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
207                 // Table row specified "quoted" and the char was quoted.
208                 break;
209             }
210             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
211                 // Table row specified eof and we hit eof on the input.
212                 break;
213             }
214 
215             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
216                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
217                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
218                 U_ASSERT(tableEl->fCharClass <= 137);
219                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220                     // Table row specified a character class, or set of characters,
221                     //   and the current char matches it.
222                     break;
223                 }
224             }
225 
226             // No match on this row, advance to the next  row for this state,
227             tableEl++;
228         }
229         REGEX_SCAN_DEBUG_PRINTF(("\n"));
230 
231         //
232         // We've found the row of the state table that matches the current input
233         //   character from the rules string.
234         // Perform any action specified  by this row in the state table.
235         if (doParseActions(tableEl->fAction) == FALSE) {
236             // Break out of the state machine loop if the
237             //   the action signalled some kind of error, or
238             //   the action was to exit, occurs on normal end-of-rules-input.
239             break;
240         }
241 
242         if (tableEl->fPushState != 0) {
243             fStackPtr++;
244             if (fStackPtr >= kStackSize) {
245                 error(U_REGEX_INTERNAL_ERROR);
246                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247                 fStackPtr--;
248             }
249             fStack[fStackPtr] = tableEl->fPushState;
250         }
251 
252         //
253         //  NextChar.  This is where characters are actually fetched from the pattern.
254         //             Happens under control of the 'n' tag in the state table.
255         //
256         if (tableEl->fNextChar) {
257             nextChar(fC);
258         }
259 
260         // Get the next state from the table entry, or from the
261         //   state stack if the next state was specified as "pop".
262         if (tableEl->fNextState != 255) {
263             state = tableEl->fNextState;
264         } else {
265             state = fStack[fStackPtr];
266             fStackPtr--;
267             if (fStackPtr < 0) {
268                 // state stack underflow
269                 // This will occur if the user pattern has mis-matched parentheses,
270                 //   with extra close parens.
271                 //
272                 fStackPtr++;
273                 error(U_REGEX_MISMATCHED_PAREN);
274             }
275         }
276 
277     }
278 
279     if (U_FAILURE(*fStatus)) {
280         // Bail out if the pattern had errors.
281         //   Set stack cleanup:  a successful compile would have left it empty,
282         //   but errors can leave temporary sets hanging around.
283         while (!fSetStack.empty()) {
284             delete (UnicodeSet *)fSetStack.pop();
285         }
286         return;
287     }
288 
289     //
290     // The pattern has now been read and processed, and the compiled code generated.
291     //
292 
293     //
294     // The pattern's fFrameSize so far has accumulated the requirements for
295     //   storage for capture parentheses, counters, etc. that are encountered
296     //   in the pattern.  Add space for the two variables that are always
297     //   present in the saved state:  the input string position (int64_t) and
298     //   the position in the compiled pattern.
299     //
300     allocateStackData(RESTACKFRAME_HDRCOUNT);
301 
302     //
303     // Optimization pass 1: NOPs, back-references, and case-folding
304     //
305     stripNOPs();
306 
307     //
308     // Get bounds for the minimum and maximum length of a string that this
309     //   pattern can match.  Used to avoid looking for matches in strings that
310     //   are too short.
311     //
312     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
313 
314     //
315     // Optimization pass 2: match start type
316     //
317     matchStartType();
318 
319     //
320     // Set up fast latin-1 range sets
321     //
322     int32_t numSets = fRXPat->fSets->size();
323     fRXPat->fSets8 = new Regex8BitSet[numSets];
324     // Null pointer check.
325     if (fRXPat->fSets8 == NULL) {
326         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
327         return;
328     }
329     int32_t i;
330     for (i=0; i<numSets; i++) {
331         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
332         fRXPat->fSets8[i].init(s);
333     }
334 
335 }
336 
337 
338 
339 
340 
341 //------------------------------------------------------------------------------
342 //
343 //  doParseAction        Do some action during regex pattern parsing.
344 //                       Called by the parse state machine.
345 //
346 //                       Generation of the match engine PCode happens here, or
347 //                       in functions called from the parse actions defined here.
348 //
349 //
350 //------------------------------------------------------------------------------
doParseActions(int32_t action)351 UBool RegexCompile::doParseActions(int32_t action)
352 {
353     UBool   returnVal = TRUE;
354 
355     switch ((Regex_PatternParseAction)action) {
356 
357     case doPatStart:
358         // Start of pattern compiles to:
359         //0   SAVE   2        Fall back to position of FAIL
360         //1   jmp    3
361         //2   FAIL            Stop if we ever reach here.
362         //3   NOP             Dummy, so start of pattern looks the same as
363         //                    the start of an ( grouping.
364         //4   NOP             Resreved, will be replaced by a save if there are
365         //                    OR | operators at the top level
366         appendOp(URX_STATE_SAVE, 2);
367         appendOp(URX_JMP,  3);
368         appendOp(URX_FAIL, 0);
369 
370         // Standard open nonCapture paren action emits the two NOPs and
371         //   sets up the paren stack frame.
372         doParseActions(doOpenNonCaptureParen);
373         break;
374 
375     case doPatFinish:
376         // We've scanned to the end of the pattern
377         //  The end of pattern compiles to:
378         //        URX_END
379         //    which will stop the runtime match engine.
380         //  Encountering end of pattern also behaves like a close paren,
381         //   and forces fixups of the State Save at the beginning of the compiled pattern
382         //   and of any OR operations at the top level.
383         //
384         handleCloseParen();
385         if (fParenStack.size() > 0) {
386             // Missing close paren in pattern.
387             error(U_REGEX_MISMATCHED_PAREN);
388         }
389 
390         // add the END operation to the compiled pattern.
391         appendOp(URX_END, 0);
392 
393         // Terminate the pattern compilation state machine.
394         returnVal = FALSE;
395         break;
396 
397 
398 
399     case doOrOperator:
400         // Scanning a '|', as in (A|B)
401         {
402             // Generate code for any pending literals preceding the '|'
403             fixLiterals(FALSE);
404 
405             // Insert a SAVE operation at the start of the pattern section preceding
406             //   this OR at this level.  This SAVE will branch the match forward
407             //   to the right hand side of the OR in the event that the left hand
408             //   side fails to match and backtracks.  Locate the position for the
409             //   save from the location on the top of the parentheses stack.
410             int32_t savePosition = fParenStack.popi();
411             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
412             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
413             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
414             fRXPat->fCompiledPat->setElementAt(op, savePosition);
415 
416             // Append an JMP operation into the compiled pattern.  The operand for
417             //  the JMP will eventually be the location following the ')' for the
418             //  group.  This will be patched in later, when the ')' is encountered.
419             appendOp(URX_JMP, 0);
420 
421             // Push the position of the newly added JMP op onto the parentheses stack.
422             // This registers if for fixup when this block's close paren is encountered.
423             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
424 
425             // Append a NOP to the compiled pattern.  This is the slot reserved
426             //   for a SAVE in the event that there is yet another '|' following
427             //   this one.
428             appendOp(URX_NOP, 0);
429             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
430         }
431         break;
432 
433 
434     case doBeginNamedCapture:
435         // Scanning (?<letter.
436         //   The first letter of the name will come through again under doConinueNamedCapture.
437         fCaptureName = new UnicodeString();
438         if (fCaptureName == NULL) {
439             error(U_MEMORY_ALLOCATION_ERROR);
440         }
441         break;
442 
443     case  doContinueNamedCapture:
444         fCaptureName->append(fC.fChar);
445         break;
446 
447     case doBadNamedCapture:
448         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
449         break;
450 
451     case doOpenCaptureParen:
452         // Open Capturing Paren, possibly named.
453         //   Compile to a
454         //      - NOP, which later may be replaced by a save-state if the
455         //         parenthesized group gets a * quantifier, followed by
456         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
457         //      - NOP, which may later be replaced by a save-state if there
458         //             is an '|' alternation within the parens.
459         //
460         //    Each capture group gets three slots in the save stack frame:
461         //         0: Capture Group start position (in input string being matched.)
462         //         1: Capture Group end position.
463         //         2: Start of Match-in-progress.
464         //    The first two locations are for a completed capture group, and are
465         //     referred to by back references and the like.
466         //    The third location stores the capture start position when an START_CAPTURE is
467         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
468         //      END_CAPTURE is encountered.
469         {
470             fixLiterals();
471             appendOp(URX_NOP, 0);
472             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
473             appendOp(URX_START_CAPTURE, varsLoc);
474             appendOp(URX_NOP, 0);
475 
476             // On the Parentheses stack, start a new frame and add the postions
477             //   of the two NOPs.  Depending on what follows in the pattern, the
478             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
479             //   address of the end of the parenthesized group.
480             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
481             fParenStack.push(capturing, *fStatus);                        // Frame type.
482             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
483             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
484 
485             // Save the mapping from group number to stack frame variable position.
486             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
487 
488             // If this is a named capture group, add the name->group number mapping.
489             if (fCaptureName != NULL) {
490                 if (!fRXPat->initNamedCaptureMap()) {
491                     if (U_SUCCESS(*fStatus)) {
492                         error(fRXPat->fDeferredStatus);
493                     }
494                     break;
495                 }
496                 int32_t groupNumber = fRXPat->fGroupMap->size();
497                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
498                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
499                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
500                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
501                 }
502             }
503         }
504         break;
505 
506     case doOpenNonCaptureParen:
507         // Open non-caputuring (grouping only) Paren.
508         //   Compile to a
509         //      - NOP, which later may be replaced by a save-state if the
510         //         parenthesized group gets a * quantifier, followed by
511         //      - NOP, which may later be replaced by a save-state if there
512         //             is an '|' alternation within the parens.
513         {
514             fixLiterals();
515             appendOp(URX_NOP, 0);
516             appendOp(URX_NOP, 0);
517 
518             // On the Parentheses stack, start a new frame and add the postions
519             //   of the two NOPs.
520             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
521             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
522             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
523             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
524         }
525          break;
526 
527 
528     case doOpenAtomicParen:
529         // Open Atomic Paren.  (?>
530         //   Compile to a
531         //      - NOP, which later may be replaced if the parenthesized group
532         //         has a quantifier, followed by
533         //      - STO_SP  save state stack position, so it can be restored at the ")"
534         //      - NOP, which may later be replaced by a save-state if there
535         //             is an '|' alternation within the parens.
536         {
537             fixLiterals();
538             appendOp(URX_NOP, 0);
539             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
540             appendOp(URX_STO_SP, varLoc);
541             appendOp(URX_NOP, 0);
542 
543             // On the Parentheses stack, start a new frame and add the postions
544             //   of the two NOPs.  Depending on what follows in the pattern, the
545             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
546             //   address of the end of the parenthesized group.
547             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
548             fParenStack.push(atomic, *fStatus);                           // Frame type.
549             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
550             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
551         }
552         break;
553 
554 
555     case doOpenLookAhead:
556         // Positive Look-ahead   (?=  stuff  )
557         //
558         //   Note:   Addition of transparent input regions, with the need to
559         //           restore the original regions when failing out of a lookahead
560         //           block, complicated this sequence.  Some conbined opcodes
561         //           might make sense - or might not, lookahead aren't that common.
562         //
563         //      Caution:  min match length optimization knows about this
564         //               sequence; don't change without making updates there too.
565         //
566         // Compiles to
567         //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
568         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
569         //    3    JMP          6           continue ...
570         //
571         //    4.   LA_END                   Look Ahead failed.  Restore regions.
572         //    5.   BACKTRACK                and back track again.
573         //
574         //    6.   NOP              reserved for use by quantifiers on the block.
575         //                          Look-ahead can't have quantifiers, but paren stack
576         //                             compile time conventions require the slot anyhow.
577         //    7.   NOP              may be replaced if there is are '|' ops in the block.
578         //    8.     code for parenthesized stuff.
579         //    9.   LA_END
580         //
581         //  Four data slots are reserved, for saving state on entry to the look-around
582         //    0:   stack pointer on entry.
583         //    1:   input position on entry.
584         //    2:   fActiveStart, the active bounds start on entry.
585         //    3:   fActiveLimit, the active bounds limit on entry.
586         {
587             fixLiterals();
588             int32_t dataLoc = allocateData(4);
589             appendOp(URX_LA_START, dataLoc);
590             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
591             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
592             appendOp(URX_LA_END, dataLoc);
593             appendOp(URX_BACKTRACK, 0);
594             appendOp(URX_NOP, 0);
595             appendOp(URX_NOP, 0);
596 
597             // On the Parentheses stack, start a new frame and add the postions
598             //   of the NOPs.
599             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
600             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
601             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
602             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
603         }
604         break;
605 
606     case doOpenLookAheadNeg:
607         // Negated Lookahead.   (?! stuff )
608         // Compiles to
609         //    1.    LA_START    dataloc
610         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
611         //                                //   which continues with the match.
612         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
613         //    4.       code for parenthesized stuff.
614         //    5.    LA_END                // Cut back stack, remove saved state from step 2.
615         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
616         //    7.    END_LA                // Restore match region, in case look-ahead was using
617         //                                        an alternate (transparent) region.
618         //  Four data slots are reserved, for saving state on entry to the look-around
619         //    0:   stack pointer on entry.
620         //    1:   input position on entry.
621         //    2:   fActiveStart, the active bounds start on entry.
622         //    3:   fActiveLimit, the active bounds limit on entry.
623         {
624             fixLiterals();
625             int32_t dataLoc = allocateData(4);
626             appendOp(URX_LA_START, dataLoc);
627             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
628             appendOp(URX_NOP, 0);
629 
630             // On the Parentheses stack, start a new frame and add the postions
631             //   of the StateSave and NOP.
632             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
633             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
634             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
635             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
636 
637             // Instructions #5 - #7 will be added when the ')' is encountered.
638         }
639         break;
640 
641     case doOpenLookBehind:
642         {
643             //   Compile a (?<= look-behind open paren.
644             //
645             //          Compiles to
646             //              0       URX_LB_START     dataLoc
647             //              1       URX_LB_CONT      dataLoc
648             //              2                        MinMatchLen
649             //              3                        MaxMatchLen
650             //              4       URX_NOP          Standard '(' boilerplate.
651             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
652             //              6         <code for LookBehind expression>
653             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
654             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
655             //
656             //          Allocate a block of matcher data, to contain (when running a match)
657             //              0:    Stack ptr on entry
658             //              1:    Input Index on entry
659             //              2:    fActiveStart, the active bounds start on entry.
660             //              3:    fActiveLimit, the active bounds limit on entry.
661             //              4:    Start index of match current match attempt.
662             //          The first four items must match the layout of data for LA_START / LA_END
663 
664             // Generate match code for any pending literals.
665             fixLiterals();
666 
667             // Allocate data space
668             int32_t dataLoc = allocateData(5);
669 
670             // Emit URX_LB_START
671             appendOp(URX_LB_START, dataLoc);
672 
673             // Emit URX_LB_CONT
674             appendOp(URX_LB_CONT, dataLoc);
675             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
676             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
677 
678             // Emit the NOPs
679             appendOp(URX_NOP, 0);
680             appendOp(URX_NOP, 0);
681 
682             // On the Parentheses stack, start a new frame and add the postions
683             //   of the URX_LB_CONT and the NOP.
684             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
685             fParenStack.push(lookBehind, *fStatus);                       // Frame type
686             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
687             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
688 
689             // The final two instructions will be added when the ')' is encountered.
690         }
691 
692         break;
693 
694     case doOpenLookBehindNeg:
695         {
696             //   Compile a (?<! negated look-behind open paren.
697             //
698             //          Compiles to
699             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
700             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
701             //              2                        MinMatchLen
702             //              3                        MaxMatchLen
703             //              4                        continueLoc (9)
704             //              5       URX_NOP          Standard '(' boilerplate.
705             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
706             //              7         <code for LookBehind expression>
707             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
708             //              9       ...
709             //
710             //          Allocate a block of matcher data, to contain (when running a match)
711             //              0:    Stack ptr on entry
712             //              1:    Input Index on entry
713             //              2:    fActiveStart, the active bounds start on entry.
714             //              3:    fActiveLimit, the active bounds limit on entry.
715             //              4:    Start index of match current match attempt.
716             //          The first four items must match the layout of data for LA_START / LA_END
717 
718             // Generate match code for any pending literals.
719             fixLiterals();
720 
721             // Allocate data space
722             int32_t dataLoc = allocateData(5);
723 
724             // Emit URX_LB_START
725             appendOp(URX_LB_START, dataLoc);
726 
727             // Emit URX_LBN_CONT
728             appendOp(URX_LBN_CONT, dataLoc);
729             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
730             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
731             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
732 
733             // Emit the NOPs
734             appendOp(URX_NOP, 0);
735             appendOp(URX_NOP, 0);
736 
737             // On the Parentheses stack, start a new frame and add the postions
738             //   of the URX_LB_CONT and the NOP.
739             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
740             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
741             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
742             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
743 
744             // The final two instructions will be added when the ')' is encountered.
745         }
746         break;
747 
748     case doConditionalExpr:
749         // Conditionals such as (?(1)a:b)
750     case doPerlInline:
751         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
752         error(U_REGEX_UNIMPLEMENTED);
753         break;
754 
755 
756     case doCloseParen:
757         handleCloseParen();
758         if (fParenStack.size() <= 0) {
759             //  Extra close paren, or missing open paren.
760             error(U_REGEX_MISMATCHED_PAREN);
761         }
762         break;
763 
764     case doNOP:
765         break;
766 
767 
768     case doBadOpenParenType:
769     case doRuleError:
770         error(U_REGEX_RULE_SYNTAX);
771         break;
772 
773 
774     case doMismatchedParenErr:
775         error(U_REGEX_MISMATCHED_PAREN);
776         break;
777 
778     case doPlus:
779         //  Normal '+'  compiles to
780         //     1.   stuff to be repeated  (already built)
781         //     2.   jmp-sav 1
782         //     3.   ...
783         //
784         //  Or, if the item to be repeated can match a zero length string,
785         //     1.   STO_INP_LOC  data-loc
786         //     2.      body of stuff to be repeated
787         //     3.   JMP_SAV_X    2
788         //     4.   ...
789 
790         //
791         //  Or, if the item to be repeated is simple
792         //     1.   Item to be repeated.
793         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
794         //     3.   LOOP_C       stack location
795         {
796             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
797             int32_t  frameLoc;
798 
799             // Check for simple constructs, which may get special optimized code.
800             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
801                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
802 
803                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
804                     // Emit optimized code for [char set]+
805                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
806                     frameLoc = allocateStackData(1);
807                     appendOp(URX_LOOP_C, frameLoc);
808                     break;
809                 }
810 
811                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
812                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
813                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
814                     // Emit Optimized code for .+ operations.
815                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
816                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
817                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
818                         loopOpI |= 1;
819                     }
820                     if (fModeFlags & UREGEX_UNIX_LINES) {
821                         loopOpI |= 2;
822                     }
823                     appendOp(loopOpI);
824                     frameLoc = allocateStackData(1);
825                     appendOp(URX_LOOP_C, frameLoc);
826                     break;
827                 }
828 
829             }
830 
831             // General case.
832 
833             // Check for minimum match length of zero, which requires
834             //    extra loop-breaking code.
835             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
836                 // Zero length match is possible.
837                 // Emit the code sequence that can handle it.
838                 insertOp(topLoc);
839                 frameLoc = allocateStackData(1);
840 
841                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
842                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
843 
844                 appendOp(URX_JMP_SAV_X, topLoc+1);
845             } else {
846                 // Simpler code when the repeated body must match something non-empty
847                 appendOp(URX_JMP_SAV, topLoc);
848             }
849         }
850         break;
851 
852     case doNGPlus:
853         //  Non-greedy '+?'  compiles to
854         //     1.   stuff to be repeated  (already built)
855         //     2.   state-save  1
856         //     3.   ...
857         {
858             int32_t topLoc      = blockTopLoc(FALSE);
859             appendOp(URX_STATE_SAVE, topLoc);
860         }
861         break;
862 
863 
864     case doOpt:
865         // Normal (greedy) ? quantifier.
866         //  Compiles to
867         //     1. state save 3
868         //     2.    body of optional block
869         //     3. ...
870         // Insert the state save into the compiled pattern, and we're done.
871         {
872             int32_t   saveStateLoc = blockTopLoc(TRUE);
873             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
874             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
875         }
876         break;
877 
878     case doNGOpt:
879         // Non-greedy ?? quantifier
880         //   compiles to
881         //    1.  jmp   4
882         //    2.     body of optional block
883         //    3   jmp   5
884         //    4.  state save 2
885         //    5    ...
886         //  This code is less than ideal, with two jmps instead of one, because we can only
887         //  insert one instruction at the top of the block being iterated.
888         {
889             int32_t  jmp1_loc = blockTopLoc(TRUE);
890             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
891 
892             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
893             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
894 
895             appendOp(URX_JMP, jmp2_loc+2);
896 
897             appendOp(URX_STATE_SAVE, jmp1_loc+1);
898         }
899         break;
900 
901 
902     case doStar:
903         // Normal (greedy) * quantifier.
904         // Compiles to
905         //       1.   STATE_SAVE   4
906         //       2.      body of stuff being iterated over
907         //       3.   JMP_SAV      2
908         //       4.   ...
909         //
910         // Or, if the body is a simple [Set],
911         //       1.   LOOP_SR_I    set number
912         //       2.   LOOP_C       stack location
913         //       ...
914         //
915         // Or if this is a .*
916         //       1.   LOOP_DOT_I    (. matches all mode flag)
917         //       2.   LOOP_C        stack location
918         //
919         // Or, if the body can match a zero-length string, to inhibit infinite loops,
920         //       1.   STATE_SAVE   5
921         //       2.   STO_INP_LOC  data-loc
922         //       3.      body of stuff
923         //       4.   JMP_SAV_X    2
924         //       5.   ...
925         {
926             // location of item #1, the STATE_SAVE
927             int32_t   topLoc = blockTopLoc(FALSE);
928             int32_t   dataLoc = -1;
929 
930             // Check for simple *, where the construct being repeated
931             //   compiled to single opcode, and might be optimizable.
932             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
933                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
934 
935                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
936                     // Emit optimized code for a [char set]*
937                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
938                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
939                     dataLoc = allocateStackData(1);
940                     appendOp(URX_LOOP_C, dataLoc);
941                     break;
942                 }
943 
944                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
945                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
946                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
947                     // Emit Optimized code for .* operations.
948                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
949                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
950                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
951                         loopOpI |= 1;
952                     }
953                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
954                         loopOpI |= 2;
955                     }
956                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
957                     dataLoc = allocateStackData(1);
958                     appendOp(URX_LOOP_C, dataLoc);
959                     break;
960                 }
961             }
962 
963             // Emit general case code for this *
964             // The optimizations did not apply.
965 
966             int32_t   saveStateLoc = blockTopLoc(TRUE);
967             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
968 
969             // Check for minimum match length of zero, which requires
970             //    extra loop-breaking code.
971             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
972                 insertOp(saveStateLoc);
973                 dataLoc = allocateStackData(1);
974 
975                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
976                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
977                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
978             }
979 
980             // Locate the position in the compiled pattern where the match will continue
981             //   after completing the *.   (4 or 5 in the comment above)
982             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
983 
984             // Put together the save state op and store it into the compiled code.
985             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
986             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
987 
988             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
989             appendOp(jmpOp);
990         }
991         break;
992 
993     case doNGStar:
994         // Non-greedy *? quantifier
995         // compiles to
996         //     1.   JMP    3
997         //     2.      body of stuff being iterated over
998         //     3.   STATE_SAVE  2
999         //     4    ...
1000         {
1001             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
1002             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
1003             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
1004             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1005             appendOp(URX_STATE_SAVE, jmpLoc+1);
1006         }
1007         break;
1008 
1009 
1010     case doIntervalInit:
1011         // The '{' opening an interval quantifier was just scanned.
1012         // Init the counter varaiables that will accumulate the values as the digits
1013         //    are scanned.
1014         fIntervalLow = 0;
1015         fIntervalUpper = -1;
1016         break;
1017 
1018     case doIntevalLowerDigit:
1019         // Scanned a digit from the lower value of an {lower,upper} interval
1020         {
1021             int32_t digitValue = u_charDigitValue(fC.fChar);
1022             U_ASSERT(digitValue >= 0);
1023             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1024             if (val > INT32_MAX) {
1025                 error(U_REGEX_NUMBER_TOO_BIG);
1026             } else {
1027                 fIntervalLow = (int32_t)val;
1028             }
1029         }
1030         break;
1031 
1032     case doIntervalUpperDigit:
1033         // Scanned a digit from the upper value of an {lower,upper} interval
1034         {
1035             if (fIntervalUpper < 0) {
1036                 fIntervalUpper = 0;
1037             }
1038             int32_t digitValue = u_charDigitValue(fC.fChar);
1039             U_ASSERT(digitValue >= 0);
1040             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1041             if (val > INT32_MAX) {
1042                 error(U_REGEX_NUMBER_TOO_BIG);
1043             } else {
1044                 fIntervalUpper = (int32_t)val;
1045             }
1046         }
1047         break;
1048 
1049     case doIntervalSame:
1050         // Scanned a single value interval like {27}.  Upper = Lower.
1051         fIntervalUpper = fIntervalLow;
1052         break;
1053 
1054     case doInterval:
1055         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1056         if (compileInlineInterval() == FALSE) {
1057             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1058         }
1059         break;
1060 
1061     case doPossessiveInterval:
1062         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1063         {
1064             // Remember the loc for the top of the block being looped over.
1065             //   (Can not reserve a slot in the compiled pattern at this time, because
1066             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1067             //    once per block.)
1068             int32_t topLoc = blockTopLoc(FALSE);
1069 
1070             // Produce normal looping code.
1071             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1072 
1073             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1074             //  just as if the loop was inclosed in atomic parentheses.
1075 
1076             // First the STO_SP before the start of the loop
1077             insertOp(topLoc);
1078 
1079             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1080             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1081             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1082 
1083             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1084             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1085             loopOp++;     // point LoopOp after the just-inserted STO_SP
1086             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1087 
1088             // Then the LD_SP after the end of the loop
1089             appendOp(URX_LD_SP, varLoc);
1090         }
1091 
1092         break;
1093 
1094     case doNGInterval:
1095         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1096         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1097         break;
1098 
1099     case doIntervalError:
1100         error(U_REGEX_BAD_INTERVAL);
1101         break;
1102 
1103     case doLiteralChar:
1104         // We've just scanned a "normal" character from the pattern,
1105         literalChar(fC.fChar);
1106         break;
1107 
1108 
1109     case doEscapedLiteralChar:
1110         // We've just scanned an backslashed escaped character with  no
1111         //   special meaning.  It represents itself.
1112         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1113             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1114             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1115                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1116              }
1117         literalChar(fC.fChar);
1118         break;
1119 
1120 
1121     case doDotAny:
1122         // scanned a ".",  match any single character.
1123         {
1124             fixLiterals(FALSE);
1125             if (fModeFlags & UREGEX_DOTALL) {
1126                 appendOp(URX_DOTANY_ALL, 0);
1127             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1128                 appendOp(URX_DOTANY_UNIX, 0);
1129             } else {
1130                 appendOp(URX_DOTANY, 0);
1131             }
1132         }
1133         break;
1134 
1135     case doCaret:
1136         {
1137             fixLiterals(FALSE);
1138             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1139                 appendOp(URX_CARET, 0);
1140             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1141                 appendOp(URX_CARET_M, 0);
1142             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1143                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1144             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1145                 appendOp(URX_CARET_M_UNIX, 0);
1146             }
1147         }
1148         break;
1149 
1150     case doDollar:
1151         {
1152             fixLiterals(FALSE);
1153             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1154                 appendOp(URX_DOLLAR, 0);
1155             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1156                 appendOp(URX_DOLLAR_M, 0);
1157             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1158                 appendOp(URX_DOLLAR_D, 0);
1159             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1160                 appendOp(URX_DOLLAR_MD, 0);
1161             }
1162         }
1163         break;
1164 
1165     case doBackslashA:
1166         fixLiterals(FALSE);
1167         appendOp(URX_CARET, 0);
1168         break;
1169 
1170     case doBackslashB:
1171         {
1172             #if  UCONFIG_NO_BREAK_ITERATION==1
1173             if (fModeFlags & UREGEX_UWORD) {
1174                 error(U_UNSUPPORTED_ERROR);
1175             }
1176             #endif
1177             fixLiterals(FALSE);
1178             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1179             appendOp(op, 1);
1180         }
1181         break;
1182 
1183     case doBackslashb:
1184         {
1185             #if  UCONFIG_NO_BREAK_ITERATION==1
1186             if (fModeFlags & UREGEX_UWORD) {
1187                 error(U_UNSUPPORTED_ERROR);
1188             }
1189             #endif
1190             fixLiterals(FALSE);
1191             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1192             appendOp(op, 0);
1193         }
1194         break;
1195 
1196     case doBackslashD:
1197         fixLiterals(FALSE);
1198         appendOp(URX_BACKSLASH_D, 1);
1199         break;
1200 
1201     case doBackslashd:
1202         fixLiterals(FALSE);
1203         appendOp(URX_BACKSLASH_D, 0);
1204         break;
1205 
1206     case doBackslashG:
1207         fixLiterals(FALSE);
1208         appendOp(URX_BACKSLASH_G, 0);
1209         break;
1210 
1211     case doBackslashH:
1212         fixLiterals(FALSE);
1213         appendOp(URX_BACKSLASH_H, 1);
1214         break;
1215 
1216     case doBackslashh:
1217         fixLiterals(FALSE);
1218         appendOp(URX_BACKSLASH_H, 0);
1219         break;
1220 
1221     case doBackslashR:
1222         fixLiterals(FALSE);
1223         appendOp(URX_BACKSLASH_R, 0);
1224         break;
1225 
1226     case doBackslashS:
1227         fixLiterals(FALSE);
1228         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1229         break;
1230 
1231     case doBackslashs:
1232         fixLiterals(FALSE);
1233         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1234         break;
1235 
1236     case doBackslashV:
1237         fixLiterals(FALSE);
1238         appendOp(URX_BACKSLASH_V, 1);
1239         break;
1240 
1241     case doBackslashv:
1242         fixLiterals(FALSE);
1243         appendOp(URX_BACKSLASH_V, 0);
1244         break;
1245 
1246     case doBackslashW:
1247         fixLiterals(FALSE);
1248         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1249         break;
1250 
1251     case doBackslashw:
1252         fixLiterals(FALSE);
1253         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1254         break;
1255 
1256     case doBackslashX:
1257         #if  UCONFIG_NO_BREAK_ITERATION==1
1258         // Grapheme Cluster Boundary requires ICU break iteration.
1259         error(U_UNSUPPORTED_ERROR);
1260         #endif
1261         fixLiterals(FALSE);
1262         appendOp(URX_BACKSLASH_X, 0);
1263         break;
1264 
1265     case doBackslashZ:
1266         fixLiterals(FALSE);
1267         appendOp(URX_DOLLAR, 0);
1268         break;
1269 
1270     case doBackslashz:
1271         fixLiterals(FALSE);
1272         appendOp(URX_BACKSLASH_Z, 0);
1273         break;
1274 
1275     case doEscapeError:
1276         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1277         break;
1278 
1279     case doExit:
1280         fixLiterals(FALSE);
1281         returnVal = FALSE;
1282         break;
1283 
1284     case doProperty:
1285         {
1286             fixLiterals(FALSE);
1287             UnicodeSet *theSet = scanProp();
1288             compileSet(theSet);
1289         }
1290         break;
1291 
1292     case doNamedChar:
1293         {
1294             UChar32 c = scanNamedChar();
1295             literalChar(c);
1296         }
1297         break;
1298 
1299 
1300     case doBackRef:
1301         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1302         //                 the regular expression, because the number of digits to be consumed
1303         //                 depends on the number of capture groups that have been defined.  So
1304         //                 we have to do it here instead.
1305         {
1306             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1307             int32_t  groupNum = 0;
1308             UChar32  c        = fC.fChar;
1309 
1310             for (;;) {
1311                 // Loop once per digit, for max allowed number of digits in a back reference.
1312                 int32_t digit = u_charDigitValue(c);
1313                 groupNum = groupNum * 10 + digit;
1314                 if (groupNum >= numCaptureGroups) {
1315                     break;
1316                 }
1317                 c = peekCharLL();
1318                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1319                     break;
1320                 }
1321                 nextCharLL();
1322             }
1323 
1324             // Scan of the back reference in the source regexp is complete.  Now generate
1325             //  the compiled code for it.
1326             // Because capture groups can be forward-referenced by back-references,
1327             //  we fill the operand with the capture group number.  At the end
1328             //  of compilation, it will be changed to the variable's location.
1329             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1330                                      //    and shouldn't enter this code path at all.
1331             fixLiterals(FALSE);
1332             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1333                 appendOp(URX_BACKREF_I, groupNum);
1334             } else {
1335                 appendOp(URX_BACKREF, groupNum);
1336             }
1337         }
1338         break;
1339 
1340     case doBeginNamedBackRef:
1341         U_ASSERT(fCaptureName == NULL);
1342         fCaptureName = new UnicodeString;
1343         if (fCaptureName == NULL) {
1344             error(U_MEMORY_ALLOCATION_ERROR);
1345         }
1346         break;
1347 
1348     case doContinueNamedBackRef:
1349         fCaptureName->append(fC.fChar);
1350         break;
1351 
1352     case doCompleteNamedBackRef:
1353         {
1354         int32_t groupNumber =
1355             fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1356         if (groupNumber == 0) {
1357             // Group name has not been defined.
1358             //   Could be a forward reference. If we choose to support them at some
1359             //   future time, extra mechanism will be required at this point.
1360             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1361         } else {
1362             // Given the number, handle identically to a \n numbered back reference.
1363             // See comments above, under doBackRef
1364             fixLiterals(FALSE);
1365             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1366                 appendOp(URX_BACKREF_I, groupNumber);
1367             } else {
1368                 appendOp(URX_BACKREF, groupNumber);
1369             }
1370         }
1371         delete fCaptureName;
1372         fCaptureName = NULL;
1373         break;
1374         }
1375 
1376     case doPossessivePlus:
1377         // Possessive ++ quantifier.
1378         // Compiles to
1379         //       1.   STO_SP
1380         //       2.      body of stuff being iterated over
1381         //       3.   STATE_SAVE 5
1382         //       4.   JMP        2
1383         //       5.   LD_SP
1384         //       6.   ...
1385         //
1386         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1387         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1388         //
1389         {
1390             // Emit the STO_SP
1391             int32_t   topLoc = blockTopLoc(TRUE);
1392             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1393             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1394             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1395 
1396             // Emit the STATE_SAVE
1397             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1398 
1399             // Emit the JMP
1400             appendOp(URX_JMP, topLoc+1);
1401 
1402             // Emit the LD_SP
1403             appendOp(URX_LD_SP, stoLoc);
1404         }
1405         break;
1406 
1407     case doPossessiveStar:
1408         // Possessive *+ quantifier.
1409         // Compiles to
1410         //       1.   STO_SP       loc
1411         //       2.   STATE_SAVE   5
1412         //       3.      body of stuff being iterated over
1413         //       4.   JMP          2
1414         //       5.   LD_SP        loc
1415         //       6    ...
1416         // TODO:  do something to cut back the state stack each time through the loop.
1417         {
1418             // Reserve two slots at the top of the block.
1419             int32_t   topLoc = blockTopLoc(TRUE);
1420             insertOp(topLoc);
1421 
1422             // emit   STO_SP     loc
1423             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1424             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1425             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1426 
1427             // Emit the SAVE_STATE   5
1428             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1429             op = buildOp(URX_STATE_SAVE, L7);
1430             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1431 
1432             // Append the JMP operation.
1433             appendOp(URX_JMP, topLoc+1);
1434 
1435             // Emit the LD_SP       loc
1436             appendOp(URX_LD_SP, stoLoc);
1437         }
1438         break;
1439 
1440     case doPossessiveOpt:
1441         // Possessive  ?+ quantifier.
1442         //  Compiles to
1443         //     1. STO_SP      loc
1444         //     2. SAVE_STATE  5
1445         //     3.    body of optional block
1446         //     4. LD_SP       loc
1447         //     5. ...
1448         //
1449         {
1450             // Reserve two slots at the top of the block.
1451             int32_t   topLoc = blockTopLoc(TRUE);
1452             insertOp(topLoc);
1453 
1454             // Emit the STO_SP
1455             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1456             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1457             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1458 
1459             // Emit the SAVE_STATE
1460             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1461             op = buildOp(URX_STATE_SAVE, continueLoc);
1462             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1463 
1464             // Emit the LD_SP
1465             appendOp(URX_LD_SP, stoLoc);
1466         }
1467         break;
1468 
1469 
1470     case doBeginMatchMode:
1471         fNewModeFlags = fModeFlags;
1472         fSetModeFlag  = TRUE;
1473         break;
1474 
1475     case doMatchMode:   //  (?i)    and similar
1476         {
1477             int32_t  bit = 0;
1478             switch (fC.fChar) {
1479             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1480             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1481             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1482             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1483             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1484             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1485             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1486             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1487             default:
1488                 UPRV_UNREACHABLE;   // Should never happen.  Other chars are filtered out
1489                                    // by the scanner.
1490             }
1491             if (fSetModeFlag) {
1492                 fNewModeFlags |= bit;
1493             } else {
1494                 fNewModeFlags &= ~bit;
1495             }
1496         }
1497         break;
1498 
1499     case doSetMatchMode:
1500         // Emit code to match any pending literals, using the not-yet changed match mode.
1501         fixLiterals();
1502 
1503         // We've got a (?i) or similar.  The match mode is being changed, but
1504         //   the change is not scoped to a parenthesized block.
1505         U_ASSERT(fNewModeFlags < 0);
1506         fModeFlags = fNewModeFlags;
1507 
1508         break;
1509 
1510 
1511     case doMatchModeParen:
1512         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1513         //   mode flags so they can be restored at the close of the block.
1514         //
1515         //   Compile to a
1516         //      - NOP, which later may be replaced by a save-state if the
1517         //         parenthesized group gets a * quantifier, followed by
1518         //      - NOP, which may later be replaced by a save-state if there
1519         //             is an '|' alternation within the parens.
1520         {
1521             fixLiterals(FALSE);
1522             appendOp(URX_NOP, 0);
1523             appendOp(URX_NOP, 0);
1524 
1525             // On the Parentheses stack, start a new frame and add the postions
1526             //   of the two NOPs (a normal non-capturing () frame, except for the
1527             //   saving of the orignal mode flags.)
1528             fParenStack.push(fModeFlags, *fStatus);
1529             fParenStack.push(flags, *fStatus);                            // Frame Marker
1530             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1531             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1532 
1533             // Set the current mode flags to the new values.
1534             U_ASSERT(fNewModeFlags < 0);
1535             fModeFlags = fNewModeFlags;
1536         }
1537         break;
1538 
1539     case doBadModeFlag:
1540         error(U_REGEX_INVALID_FLAG);
1541         break;
1542 
1543     case doSuppressComments:
1544         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1545         // treating a '#' as a to-the-end-of-line comment.
1546         //   (This Perl compatibility just gets uglier and uglier to do...)
1547         fEOLComments = FALSE;
1548         break;
1549 
1550 
1551     case doSetAddAmp:
1552         {
1553           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554           set->add(chAmp);
1555         }
1556         break;
1557 
1558     case doSetAddDash:
1559         {
1560           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1561           set->add(chDash);
1562         }
1563         break;
1564 
1565      case doSetBackslash_s:
1566         {
1567          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1568          set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1569          break;
1570         }
1571 
1572      case doSetBackslash_S:
1573         {
1574             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1575             UnicodeSet SSet;
1576             SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1577             set->addAll(SSet);
1578             break;
1579         }
1580 
1581     case doSetBackslash_d:
1582         {
1583             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1584             // TODO - make a static set, ticket 6058.
1585             addCategory(set, U_GC_ND_MASK, *fStatus);
1586             break;
1587         }
1588 
1589     case doSetBackslash_D:
1590         {
1591             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1592             UnicodeSet digits;
1593             // TODO - make a static set, ticket 6058.
1594             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1595             digits.complement();
1596             set->addAll(digits);
1597             break;
1598         }
1599 
1600     case doSetBackslash_h:
1601         {
1602             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1603             UnicodeSet h;
1604             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1605             h.add((UChar32)9);   // Tab
1606             set->addAll(h);
1607             break;
1608         }
1609 
1610     case doSetBackslash_H:
1611         {
1612             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1613             UnicodeSet h;
1614             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1615             h.add((UChar32)9);   // Tab
1616             h.complement();
1617             set->addAll(h);
1618             break;
1619         }
1620 
1621     case doSetBackslash_v:
1622         {
1623             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1624             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1625             set->add((UChar32)0x85);
1626             set->add((UChar32)0x2028, (UChar32)0x2029);
1627             break;
1628         }
1629 
1630     case doSetBackslash_V:
1631         {
1632             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1633             UnicodeSet v;
1634             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1635             v.add((UChar32)0x85);
1636             v.add((UChar32)0x2028, (UChar32)0x2029);
1637             v.complement();
1638             set->addAll(v);
1639             break;
1640         }
1641 
1642     case doSetBackslash_w:
1643         {
1644             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1645             set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1646             break;
1647         }
1648 
1649     case doSetBackslash_W:
1650         {
1651             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1652             UnicodeSet SSet;
1653             SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1654             set->addAll(SSet);
1655             break;
1656         }
1657 
1658     case doSetBegin:
1659         fixLiterals(FALSE);
1660         fSetStack.push(new UnicodeSet(), *fStatus);
1661         fSetOpStack.push(setStart, *fStatus);
1662         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1663             fSetOpStack.push(setCaseClose, *fStatus);
1664         }
1665         break;
1666 
1667     case doSetBeginDifference1:
1668         //  We have scanned something like [[abc]-[
1669         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1670         //  Push a Difference operator, which will cause the new set to be subtracted from what
1671         //    went before once it is created.
1672         setPushOp(setDifference1);
1673         fSetOpStack.push(setStart, *fStatus);
1674         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1675             fSetOpStack.push(setCaseClose, *fStatus);
1676         }
1677         break;
1678 
1679     case doSetBeginIntersection1:
1680         //  We have scanned something like  [[abc]&[
1681         //   Need both the '&' operator and the open '[' operator.
1682         setPushOp(setIntersection1);
1683         fSetOpStack.push(setStart, *fStatus);
1684         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1685             fSetOpStack.push(setCaseClose, *fStatus);
1686         }
1687         break;
1688 
1689     case doSetBeginUnion:
1690         //  We have scanned something like  [[abc][
1691         //     Need to handle the union operation explicitly [[abc] | [
1692         setPushOp(setUnion);
1693         fSetOpStack.push(setStart, *fStatus);
1694         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1695             fSetOpStack.push(setCaseClose, *fStatus);
1696         }
1697         break;
1698 
1699     case doSetDifference2:
1700         // We have scanned something like [abc--
1701         //   Consider this to unambiguously be a set difference operator.
1702         setPushOp(setDifference2);
1703         break;
1704 
1705     case doSetEnd:
1706         // Have encountered the ']' that closes a set.
1707         //    Force the evaluation of any pending operations within this set,
1708         //    leave the completed set on the top of the set stack.
1709         setEval(setEnd);
1710         U_ASSERT(fSetOpStack.peeki()==setStart);
1711         fSetOpStack.popi();
1712         break;
1713 
1714     case doSetFinish:
1715         {
1716         // Finished a complete set expression, including all nested sets.
1717         //   The close bracket has already triggered clearing out pending set operators,
1718         //    the operator stack should be empty and the operand stack should have just
1719         //    one entry, the result set.
1720         U_ASSERT(fSetOpStack.empty());
1721         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1722         U_ASSERT(fSetStack.empty());
1723         compileSet(theSet);
1724         break;
1725         }
1726 
1727     case doSetIntersection2:
1728         // Have scanned something like [abc&&
1729         setPushOp(setIntersection2);
1730         break;
1731 
1732     case doSetLiteral:
1733         // Union the just-scanned literal character into the set being built.
1734         //    This operation is the highest precedence set operation, so we can always do
1735         //    it immediately, without waiting to see what follows.  It is necessary to perform
1736         //    any pending '-' or '&' operation first, because these have the same precedence
1737         //    as union-ing in a literal'
1738         {
1739             setEval(setUnion);
1740             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1741             s->add(fC.fChar);
1742             fLastSetLiteral = fC.fChar;
1743             break;
1744         }
1745 
1746     case doSetLiteralEscaped:
1747         // A back-slash escaped literal character was encountered.
1748         // Processing is the same as with setLiteral, above, with the addition of
1749         //  the optional check for errors on escaped ASCII letters.
1750         {
1751             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1752                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1753                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1754                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1755             }
1756             setEval(setUnion);
1757             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1758             s->add(fC.fChar);
1759             fLastSetLiteral = fC.fChar;
1760             break;
1761         }
1762 
1763         case doSetNamedChar:
1764         // Scanning a \N{UNICODE CHARACTER NAME}
1765         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1766         //    above.
1767         {
1768             UChar32  c = scanNamedChar();
1769             setEval(setUnion);
1770             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1771             s->add(c);
1772             fLastSetLiteral = c;
1773             break;
1774         }
1775 
1776     case doSetNamedRange:
1777         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1778         // The left character is already in the set, and is saved in fLastSetLiteral.
1779         // The right side needs to be picked up, the scan is at the 'N'.
1780         // Lower Limit > Upper limit being an error matches both Java
1781         //        and ICU UnicodeSet behavior.
1782         {
1783             UChar32  c = scanNamedChar();
1784             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1785                 error(U_REGEX_INVALID_RANGE);
1786             }
1787             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1788             s->add(fLastSetLiteral, c);
1789             fLastSetLiteral = c;
1790             break;
1791         }
1792 
1793 
1794     case  doSetNegate:
1795         // Scanned a '^' at the start of a set.
1796         // Push the negation operator onto the set op stack.
1797         // A twist for case-insensitive matching:
1798         //   the case closure operation must happen _before_ negation.
1799         //   But the case closure operation will already be on the stack if it's required.
1800         //   This requires checking for case closure, and swapping the stack order
1801         //    if it is present.
1802         {
1803             int32_t  tosOp = fSetOpStack.peeki();
1804             if (tosOp == setCaseClose) {
1805                 fSetOpStack.popi();
1806                 fSetOpStack.push(setNegation, *fStatus);
1807                 fSetOpStack.push(setCaseClose, *fStatus);
1808             } else {
1809                 fSetOpStack.push(setNegation, *fStatus);
1810             }
1811         }
1812         break;
1813 
1814     case doSetNoCloseError:
1815         error(U_REGEX_MISSING_CLOSE_BRACKET);
1816         break;
1817 
1818     case doSetOpError:
1819         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1820         break;
1821 
1822     case doSetPosixProp:
1823         {
1824             UnicodeSet *s = scanPosixProp();
1825             if (s != NULL) {
1826                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1827                 tos->addAll(*s);
1828                 delete s;
1829             }  // else error.  scanProp() reported the error status already.
1830         }
1831         break;
1832 
1833     case doSetProp:
1834         //  Scanned a \p \P within [brackets].
1835         {
1836             UnicodeSet *s = scanProp();
1837             if (s != NULL) {
1838                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1839                 tos->addAll(*s);
1840                 delete s;
1841             }  // else error.  scanProp() reported the error status already.
1842         }
1843         break;
1844 
1845 
1846     case doSetRange:
1847         // We have scanned literal-literal.  Add the range to the set.
1848         // The left character is already in the set, and is saved in fLastSetLiteral.
1849         // The right side is the current character.
1850         // Lower Limit > Upper limit being an error matches both Java
1851         //        and ICU UnicodeSet behavior.
1852         {
1853 
1854         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1855             error(U_REGEX_INVALID_RANGE);
1856         }
1857         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1858         s->add(fLastSetLiteral, fC.fChar);
1859         break;
1860         }
1861 
1862     default:
1863         UPRV_UNREACHABLE;
1864     }
1865 
1866     if (U_FAILURE(*fStatus)) {
1867         returnVal = FALSE;
1868     }
1869 
1870     return returnVal;
1871 }
1872 
1873 
1874 
1875 //------------------------------------------------------------------------------
1876 //
1877 //   literalChar           We've encountered a literal character from the pattern,
1878 //                             or an escape sequence that reduces to a character.
1879 //                         Add it to the string containing all literal chars/strings from
1880 //                             the pattern.
1881 //
1882 //------------------------------------------------------------------------------
literalChar(UChar32 c)1883 void RegexCompile::literalChar(UChar32 c)  {
1884     fLiteralChars.append(c);
1885 }
1886 
1887 
1888 //------------------------------------------------------------------------------
1889 //
1890 //    fixLiterals           When compiling something that can follow a literal
1891 //                          string in a pattern, emit the code to match the
1892 //                          accumulated literal string.
1893 //
1894 //                          Optionally, split the last char of the string off into
1895 //                          a single "ONE_CHAR" operation, so that quantifiers can
1896 //                          apply to that char alone.  Example:   abc*
1897 //                          The * must apply to the 'c' only.
1898 //
1899 //------------------------------------------------------------------------------
fixLiterals(UBool split)1900 void    RegexCompile::fixLiterals(UBool split) {
1901 
1902     // If no literal characters have been scanned but not yet had code generated
1903     //   for them, nothing needs to be done.
1904     if (fLiteralChars.length() == 0) {
1905         return;
1906     }
1907 
1908     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1909     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1910 
1911     // Split:  We need to  ensure that the last item in the compiled pattern
1912     //     refers only to the last literal scanned in the pattern, so that
1913     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1914     //     Split before case folding for case insensitive matches.
1915 
1916     if (split) {
1917         fLiteralChars.truncate(indexOfLastCodePoint);
1918         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1919                               //  Note that the truncated literal string may be empty, in which case
1920                               //  nothing will be emitted.
1921 
1922         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1923         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1924         return;
1925     }
1926 
1927     // If we are doing case-insensitive matching, case fold the string.  This may expand
1928     //   the string, e.g. the German sharp-s turns into "ss"
1929     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1930         fLiteralChars.foldCase();
1931         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1932         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1933     }
1934 
1935     if (indexOfLastCodePoint == 0) {
1936         // Single character, emit a URX_ONECHAR op to match it.
1937         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1938                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1939             appendOp(URX_ONECHAR_I, lastCodePoint);
1940         } else {
1941             appendOp(URX_ONECHAR, lastCodePoint);
1942         }
1943     } else {
1944         // Two or more chars, emit a URX_STRING to match them.
1945         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1946             error(U_REGEX_PATTERN_TOO_BIG);
1947         }
1948         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1949             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1950         } else {
1951             // TODO here:  add optimization to split case sensitive strings of length two
1952             //             into two single char ops, for efficiency.
1953             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1954         }
1955         appendOp(URX_STRING_LEN, fLiteralChars.length());
1956 
1957         // Add this string into the accumulated strings of the compiled pattern.
1958         fRXPat->fLiteralText.append(fLiteralChars);
1959     }
1960 
1961     fLiteralChars.remove();
1962 }
1963 
1964 
buildOp(int32_t type,int32_t val)1965 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1966     if (U_FAILURE(*fStatus)) {
1967         return 0;
1968     }
1969     if (type < 0 || type > 255) {
1970         UPRV_UNREACHABLE;
1971     }
1972     if (val > 0x00ffffff) {
1973         UPRV_UNREACHABLE;
1974     }
1975     if (val < 0) {
1976         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1977             UPRV_UNREACHABLE;
1978         }
1979         if (URX_TYPE(val) != 0xff) {
1980             UPRV_UNREACHABLE;
1981         }
1982         type = URX_RESERVED_OP_N;
1983     }
1984     return (type << 24) | val;
1985 }
1986 
1987 
1988 //------------------------------------------------------------------------------
1989 //
1990 //   appendOp()             Append a new instruction onto the compiled pattern
1991 //                          Includes error checking, limiting the size of the
1992 //                          pattern to lengths that can be represented in the
1993 //                          24 bit operand field of an instruction.
1994 //
1995 //------------------------------------------------------------------------------
appendOp(int32_t op)1996 void RegexCompile::appendOp(int32_t op) {
1997     if (U_FAILURE(*fStatus)) {
1998         return;
1999     }
2000     fRXPat->fCompiledPat->addElement(op, *fStatus);
2001     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2002         error(U_REGEX_PATTERN_TOO_BIG);
2003     }
2004 }
2005 
appendOp(int32_t type,int32_t val)2006 void RegexCompile::appendOp(int32_t type, int32_t val) {
2007     appendOp(buildOp(type, val));
2008 }
2009 
2010 
2011 //------------------------------------------------------------------------------
2012 //
2013 //   insertOp()             Insert a slot for a new opcode into the already
2014 //                          compiled pattern code.
2015 //
2016 //                          Fill the slot with a NOP.  Our caller will replace it
2017 //                          with what they really wanted.
2018 //
2019 //------------------------------------------------------------------------------
insertOp(int32_t where)2020 void   RegexCompile::insertOp(int32_t where) {
2021     UVector64 *code = fRXPat->fCompiledPat;
2022     U_ASSERT(where>0 && where < code->size());
2023 
2024     int32_t  nop = buildOp(URX_NOP, 0);
2025     code->insertElementAt(nop, where, *fStatus);
2026 
2027     // Walk through the pattern, looking for any ops with targets that
2028     //  were moved down by the insert.  Fix them.
2029     int32_t loc;
2030     for (loc=0; loc<code->size(); loc++) {
2031         int32_t op = (int32_t)code->elementAti(loc);
2032         int32_t opType = URX_TYPE(op);
2033         int32_t opValue = URX_VAL(op);
2034         if ((opType == URX_JMP         ||
2035             opType == URX_JMPX         ||
2036             opType == URX_STATE_SAVE   ||
2037             opType == URX_CTR_LOOP     ||
2038             opType == URX_CTR_LOOP_NG  ||
2039             opType == URX_JMP_SAV      ||
2040             opType == URX_JMP_SAV_X    ||
2041             opType == URX_RELOC_OPRND)    && opValue > where) {
2042             // Target location for this opcode is after the insertion point and
2043             //   needs to be incremented to adjust for the insertion.
2044             opValue++;
2045             op = buildOp(opType, opValue);
2046             code->setElementAt(op, loc);
2047         }
2048     }
2049 
2050     // Now fix up the parentheses stack.  All positive values in it are locations in
2051     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2052     for (loc=0; loc<fParenStack.size(); loc++) {
2053         int32_t x = fParenStack.elementAti(loc);
2054         U_ASSERT(x < code->size());
2055         if (x>where) {
2056             x++;
2057             fParenStack.setElementAt(x, loc);
2058         }
2059     }
2060 
2061     if (fMatchCloseParen > where) {
2062         fMatchCloseParen++;
2063     }
2064     if (fMatchOpenParen > where) {
2065         fMatchOpenParen++;
2066     }
2067 }
2068 
2069 
2070 //------------------------------------------------------------------------------
2071 //
2072 //   allocateData()        Allocate storage in the matcher's static data area.
2073 //                         Return the index for the newly allocated data.
2074 //                         The storage won't actually exist until we are running a match
2075 //                         operation, but the storage indexes are inserted into various
2076 //                         opcodes while compiling the pattern.
2077 //
2078 //------------------------------------------------------------------------------
allocateData(int32_t size)2079 int32_t RegexCompile::allocateData(int32_t size) {
2080     if (U_FAILURE(*fStatus)) {
2081         return 0;
2082     }
2083     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2084         error(U_REGEX_INTERNAL_ERROR);
2085         return 0;
2086     }
2087     int32_t dataIndex = fRXPat->fDataSize;
2088     fRXPat->fDataSize += size;
2089     if (fRXPat->fDataSize >= 0x00fffff0) {
2090         error(U_REGEX_INTERNAL_ERROR);
2091     }
2092     return dataIndex;
2093 }
2094 
2095 
2096 //------------------------------------------------------------------------------
2097 //
2098 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2099 //                         Return the index for the newly allocated data.
2100 //                         The frame indexes are inserted into various
2101 //                         opcodes while compiling the pattern, meaning that frame
2102 //                         size must be restricted to the size that will fit
2103 //                         as an operand (24 bits).
2104 //
2105 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2106 int32_t RegexCompile::allocateStackData(int32_t size) {
2107     if (U_FAILURE(*fStatus)) {
2108         return 0;
2109     }
2110     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2111         error(U_REGEX_INTERNAL_ERROR);
2112         return 0;
2113     }
2114     int32_t dataIndex = fRXPat->fFrameSize;
2115     fRXPat->fFrameSize += size;
2116     if (fRXPat->fFrameSize >= 0x00fffff0) {
2117         error(U_REGEX_PATTERN_TOO_BIG);
2118     }
2119     return dataIndex;
2120 }
2121 
2122 
2123 //------------------------------------------------------------------------------
2124 //
2125 //   blockTopLoc()          Find or create a location in the compiled pattern
2126 //                          at the start of the operation or block that has
2127 //                          just been compiled.  Needed when a quantifier (* or
2128 //                          whatever) appears, and we need to add an operation
2129 //                          at the start of the thing being quantified.
2130 //
2131 //                          (Parenthesized Blocks) have a slot with a NOP that
2132 //                          is reserved for this purpose.  .* or similar don't
2133 //                          and a slot needs to be added.
2134 //
2135 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2136 //                                         at the returned location.
2137 //                                 FALSE - just return the address,
2138 //                                         do not reserve a location there.
2139 //
2140 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2141 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2142     int32_t   theLoc;
2143     fixLiterals(TRUE);  // Emit code for any pending literals.
2144                         //   If last item was a string, emit separate op for the its last char.
2145     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2146     {
2147         // The item just processed is a parenthesized block.
2148         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2149         U_ASSERT(theLoc > 0);
2150         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2151     }
2152     else {
2153         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2154         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2155         // We need to make space now.
2156         theLoc = fRXPat->fCompiledPat->size()-1;
2157         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2158         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2159             // Strings take two opcode, we want the position of the first one.
2160             // We can have a string at this point if a single character case-folded to two.
2161             theLoc--;
2162         }
2163         if (reserveLoc) {
2164             int32_t  nop = buildOp(URX_NOP, 0);
2165             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2166         }
2167     }
2168     return theLoc;
2169 }
2170 
2171 
2172 
2173 //------------------------------------------------------------------------------
2174 //
2175 //    handleCloseParen      When compiling a close paren, we need to go back
2176 //                          and fix up any JMP or SAVE operations within the
2177 //                          parenthesized block that need to target the end
2178 //                          of the block.  The locations of these are kept on
2179 //                          the paretheses stack.
2180 //
2181 //                          This function is called both when encountering a
2182 //                          real ) and at the end of the pattern.
2183 //
2184 //------------------------------------------------------------------------------
handleCloseParen()2185 void  RegexCompile::handleCloseParen() {
2186     int32_t   patIdx;
2187     int32_t   patOp;
2188     if (fParenStack.size() <= 0) {
2189         error(U_REGEX_MISMATCHED_PAREN);
2190         return;
2191     }
2192 
2193     // Emit code for any pending literals.
2194     fixLiterals(FALSE);
2195 
2196     // Fixup any operations within the just-closed parenthesized group
2197     //    that need to reference the end of the (block).
2198     //    (The first one popped from the stack is an unused slot for
2199     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2200     for (;;) {
2201         patIdx = fParenStack.popi();
2202         if (patIdx < 0) {
2203             // value < 0 flags the start of the frame on the paren stack.
2204             break;
2205         }
2206         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2207         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2208         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2209         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2210         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2211         fMatchOpenParen     = patIdx;
2212     }
2213 
2214     //  At the close of any parenthesized block, restore the match mode flags  to
2215     //  the value they had at the open paren.  Saved value is
2216     //  at the top of the paren stack.
2217     fModeFlags = fParenStack.popi();
2218     U_ASSERT(fModeFlags < 0);
2219 
2220     // DO any additional fixups, depending on the specific kind of
2221     // parentesized grouping this is
2222 
2223     switch (patIdx) {
2224     case plain:
2225     case flags:
2226         // No additional fixups required.
2227         //   (Grouping-only parentheses)
2228         break;
2229     case capturing:
2230         // Capturing Parentheses.
2231         //   Insert a End Capture op into the pattern.
2232         //   The frame offset of the variables for this cg is obtained from the
2233         //       start capture op and put it into the end-capture op.
2234         {
2235             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2236             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2237 
2238             int32_t   frameVarLocation = URX_VAL(captureOp);
2239             appendOp(URX_END_CAPTURE, frameVarLocation);
2240         }
2241         break;
2242     case atomic:
2243         // Atomic Parenthesis.
2244         //   Insert a LD_SP operation to restore the state stack to the position
2245         //   it was when the atomic parens were entered.
2246         {
2247             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2248             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2249             int32_t   stoLoc = URX_VAL(stoOp);
2250             appendOp(URX_LD_SP, stoLoc);
2251         }
2252         break;
2253 
2254     case lookAhead:
2255         {
2256             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2257             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2258             int32_t dataLoc  = URX_VAL(startOp);
2259             appendOp(URX_LA_END, dataLoc);
2260         }
2261         break;
2262 
2263     case negLookAhead:
2264         {
2265             // See comment at doOpenLookAheadNeg
2266             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2267             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2268             int32_t dataLoc  = URX_VAL(startOp);
2269             appendOp(URX_LA_END, dataLoc);
2270             appendOp(URX_BACKTRACK, 0);
2271             appendOp(URX_LA_END, dataLoc);
2272 
2273             // Patch the URX_SAVE near the top of the block.
2274             // The destination of the SAVE is the final LA_END that was just added.
2275             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2276             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2277             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2278             saveOp           = buildOp(URX_STATE_SAVE, dest);
2279             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2280         }
2281         break;
2282 
2283     case lookBehind:
2284         {
2285             // See comment at doOpenLookBehind.
2286 
2287             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2288             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2289             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2290             int32_t dataLoc  = URX_VAL(startOp);
2291             appendOp(URX_LB_END, dataLoc);
2292             appendOp(URX_LA_END, dataLoc);
2293 
2294             // Determine the min and max bounds for the length of the
2295             //  string that the pattern can match.
2296             //  An unbounded upper limit is an error.
2297             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2298             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2299             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2300             if (URX_TYPE(maxML) != 0) {
2301                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2302                 break;
2303             }
2304             if (maxML == INT32_MAX) {
2305                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2306                 break;
2307             }
2308             if (minML == INT32_MAX) {
2309                 // This condition happens when no match is possible, such as with a
2310                 // [set] expression containing no elements.
2311                 // In principle, the generated code to evaluate the expression could be deleted,
2312                 // but it's probably not worth the complication.
2313                 minML = 0;
2314             }
2315             U_ASSERT(minML <= maxML);
2316 
2317             // Insert the min and max match len bounds into the URX_LB_CONT op that
2318             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2319             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2320             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2321 
2322         }
2323         break;
2324 
2325 
2326 
2327     case lookBehindN:
2328         {
2329             // See comment at doOpenLookBehindNeg.
2330 
2331             // Append the URX_LBN_END to the compiled pattern.
2332             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2333             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2334             int32_t dataLoc  = URX_VAL(startOp);
2335             appendOp(URX_LBN_END, dataLoc);
2336 
2337             // Determine the min and max bounds for the length of the
2338             //  string that the pattern can match.
2339             //  An unbounded upper limit is an error.
2340             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2341             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2342             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2343             if (URX_TYPE(maxML) != 0) {
2344                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2345                 break;
2346             }
2347             if (maxML == INT32_MAX) {
2348                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2349                 break;
2350             }
2351             if (minML == INT32_MAX) {
2352                 // This condition happens when no match is possible, such as with a
2353                 // [set] expression containing no elements.
2354                 // In principle, the generated code to evaluate the expression could be deleted,
2355                 // but it's probably not worth the complication.
2356                 minML = 0;
2357             }
2358 
2359             U_ASSERT(minML <= maxML);
2360 
2361             // Insert the min and max match len bounds into the URX_LB_CONT op that
2362             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2363             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2364             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2365 
2366             // Insert the pattern location to continue at after a successful match
2367             //  as the last operand of the URX_LBN_CONT
2368             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2369             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2370         }
2371         break;
2372 
2373 
2374 
2375     default:
2376         UPRV_UNREACHABLE;
2377     }
2378 
2379     // remember the next location in the compiled pattern.
2380     // The compilation of Quantifiers will look at this to see whether its looping
2381     //   over a parenthesized block or a single item
2382     fMatchCloseParen = fRXPat->fCompiledPat->size();
2383 }
2384 
2385 
2386 
2387 //------------------------------------------------------------------------------
2388 //
2389 //   compileSet       Compile the pattern operations for a reference to a
2390 //                    UnicodeSet.
2391 //
2392 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2393 void        RegexCompile::compileSet(UnicodeSet *theSet)
2394 {
2395     if (theSet == NULL) {
2396         return;
2397     }
2398     //  Remove any strings from the set.
2399     //  There shoudn't be any, but just in case.
2400     //     (Case Closure can add them; if we had a simple case closure avaialble that
2401     //      ignored strings, that would be better.)
2402     theSet->removeAllStrings();
2403     int32_t  setSize = theSet->size();
2404 
2405     switch (setSize) {
2406     case 0:
2407         {
2408             // Set of no elements.   Always fails to match.
2409             appendOp(URX_BACKTRACK, 0);
2410             delete theSet;
2411         }
2412         break;
2413 
2414     case 1:
2415         {
2416             // The set contains only a single code point.  Put it into
2417             //   the compiled pattern as a single char operation rather
2418             //   than a set, and discard the set itself.
2419             literalChar(theSet->charAt(0));
2420             delete theSet;
2421         }
2422         break;
2423 
2424     default:
2425         {
2426             //  The set contains two or more chars.  (the normal case)
2427             //  Put it into the compiled pattern as a set.
2428             theSet->freeze();
2429             int32_t setNumber = fRXPat->fSets->size();
2430             fRXPat->fSets->addElement(theSet, *fStatus);
2431             appendOp(URX_SETREF, setNumber);
2432         }
2433     }
2434 }
2435 
2436 
2437 //------------------------------------------------------------------------------
2438 //
2439 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2440 //                      Except for the specific opcodes used, the code is the same
2441 //                      for all three types (greedy, non-greedy, possessive) of
2442 //                      intervals.  The opcodes are supplied as parameters.
2443 //                      (There are two sets of opcodes - greedy & possessive use the
2444 //                      same ones, while non-greedy has it's own.)
2445 //
2446 //                      The code for interval loops has this form:
2447 //                         0  CTR_INIT   counter loc (in stack frame)
2448 //                         1             5  patt address of CTR_LOOP at bottom of block
2449 //                         2             min count
2450 //                         3             max count   (-1 for unbounded)
2451 //                         4  ...        block to be iterated over
2452 //                         5  CTR_LOOP
2453 //
2454 //                       In
2455 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2456 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2457 {
2458     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2459     //   four slots in the compiled code.  Reserve them.
2460     int32_t   topOfBlock = blockTopLoc(TRUE);
2461     insertOp(topOfBlock);
2462     insertOp(topOfBlock);
2463     insertOp(topOfBlock);
2464 
2465     // The operands for the CTR_INIT opcode include the index in the matcher data
2466     //   of the counter.  Allocate it now. There are two data items
2467     //        counterLoc   -->  Loop counter
2468     //               +1    -->  Input index (for breaking non-progressing loops)
2469     //                          (Only present if unbounded upper limit on loop)
2470     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2471     int32_t   counterLoc = allocateStackData(dataSize);
2472 
2473     int32_t   op = buildOp(InitOp, counterLoc);
2474     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2475 
2476     // The second operand of CTR_INIT is the location following the end of the loop.
2477     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2478     //   compilation of something later on causes the code to grow and the target
2479     //   position to move.
2480     int32_t loopEnd = fRXPat->fCompiledPat->size();
2481     op = buildOp(URX_RELOC_OPRND, loopEnd);
2482     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2483 
2484     // Followed by the min and max counts.
2485     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2486     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2487 
2488     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2489     //   Goes at end of the block being looped over, so just append to the code so far.
2490     appendOp(LoopOp, topOfBlock);
2491 
2492     if ((fIntervalLow & 0xff000000) != 0 ||
2493         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2494             error(U_REGEX_NUMBER_TOO_BIG);
2495         }
2496 
2497     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2498         error(U_REGEX_MAX_LT_MIN);
2499     }
2500 }
2501 
2502 
2503 
compileInlineInterval()2504 UBool RegexCompile::compileInlineInterval() {
2505     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2506         // Too big to inline.  Fail, which will cause looping code to be generated.
2507         //   (Upper < Lower picks up unbounded upper and errors, both.)
2508         return FALSE;
2509     }
2510 
2511     int32_t   topOfBlock = blockTopLoc(FALSE);
2512     if (fIntervalUpper == 0) {
2513         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2514         // Discard the generated code for the block.
2515         // If the block included parens, discard the info pertaining to them as well.
2516         fRXPat->fCompiledPat->setSize(topOfBlock);
2517         if (fMatchOpenParen >= topOfBlock) {
2518             fMatchOpenParen = -1;
2519         }
2520         if (fMatchCloseParen >= topOfBlock) {
2521             fMatchCloseParen = -1;
2522         }
2523         return TRUE;
2524     }
2525 
2526     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2527         // The thing being repeated is not a single op, but some
2528         //   more complex block.  Do it as a loop, not inlines.
2529         //   Note that things "repeated" a max of once are handled as inline, because
2530         //     the one copy of the code already generated is just fine.
2531         return FALSE;
2532     }
2533 
2534     // Pick up the opcode that is to be repeated
2535     //
2536     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2537 
2538     // Compute the pattern location where the inline sequence
2539     //   will end, and set up the state save op that will be needed.
2540     //
2541     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2542                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2543     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2544     if (fIntervalLow == 0) {
2545         insertOp(topOfBlock);
2546         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2547     }
2548 
2549 
2550 
2551     //  Loop, emitting the op for the thing being repeated each time.
2552     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2553     //    it was put there when it was originally encountered.
2554     int32_t i;
2555     for (i=1; i<fIntervalUpper; i++ ) {
2556         if (i >= fIntervalLow) {
2557             appendOp(saveOp);
2558         }
2559         appendOp(op);
2560     }
2561     return TRUE;
2562 }
2563 
2564 
2565 
2566 //------------------------------------------------------------------------------
2567 //
2568 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2569 //                         set of characters that could potentially begin a case-insensitive
2570 //                         match of a string beginning with that character, using full Unicode
2571 //                         case insensitive matching.
2572 //
2573 //          This is used in optimizing find().
2574 //
2575 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2576 //          misses cases like this:
2577 //             A string from the pattern begins with 'ss' (although all we know
2578 //                 in this context is that it begins with 's')
2579 //             The pattern could match a string beginning with a German sharp-s
2580 //
2581 //           To the ordinary case closure for a character c, we add all other
2582 //           characters cx where the case closure of cx incudes a string form that begins
2583 //           with the original character c.
2584 //
2585 //           This function could be made smarter. The full pattern string is available
2586 //           and it would be possible to verify that the extra characters being added
2587 //           to the starting set fully match, rather than having just a first-char of the
2588 //           folded form match.
2589 //
2590 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2591 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2592 
2593 // Machine Generated below.
2594 // It may need updating with new versions of Unicode.
2595 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2596 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2597 
2598 // Machine Generated Data. Do not hand edit.
2599     static const UChar32 RECaseFixCodePoints[] = {
2600         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2601         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2602         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2603         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2604         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2605 
2606     static const int16_t RECaseFixStringOffsets[] = {
2607         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2608         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2609         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2610         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2611         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2612 
2613     static const int16_t RECaseFixCounts[] = {
2614         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2615         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2616         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2617         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2618         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2619 
2620     static const UChar RECaseFixData[] = {
2621         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2622         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2623         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2624         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2625         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2626         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2627         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2628         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2629         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2630         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2631         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2632 
2633 // End of machine generated data.
2634 
2635     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2636         // This function should never be called with an invalid input character.
2637         UPRV_UNREACHABLE;
2638     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2639         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2640         starterChars->set(caseFoldedC, caseFoldedC);
2641 
2642         int32_t i;
2643         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2644             // Simple linear search through the sorted list of interesting code points.
2645         }
2646 
2647         if (RECaseFixCodePoints[i] == c) {
2648             int32_t dataIndex = RECaseFixStringOffsets[i];
2649             int32_t numCharsToAdd = RECaseFixCounts[i];
2650             UChar32 cpToAdd = 0;
2651             for (int32_t j=0; j<numCharsToAdd; j++) {
2652                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2653                 starterChars->add(cpToAdd);
2654             }
2655         }
2656 
2657         starterChars->closeOver(USET_CASE_INSENSITIVE);
2658         starterChars->removeAllStrings();
2659     } else {
2660         // Not a cased character. Just return it alone.
2661         starterChars->set(c, c);
2662     }
2663 }
2664 
2665 
2666 // Increment with overflow check.
2667 // val and delta will both be positive.
2668 
safeIncrement(int32_t val,int32_t delta)2669 static int32_t safeIncrement(int32_t val, int32_t delta) {
2670     if (INT32_MAX - val > delta) {
2671         return val + delta;
2672     } else {
2673         return INT32_MAX;
2674     }
2675 }
2676 
2677 
2678 //------------------------------------------------------------------------------
2679 //
2680 //   matchStartType    Determine how a match can start.
2681 //                     Used to optimize find() operations.
2682 //
2683 //                     Operation is very similar to minMatchLength().  Walk the compiled
2684 //                     pattern, keeping an on-going minimum-match-length.  For any
2685 //                     op where the min match coming in is zero, add that ops possible
2686 //                     starting matches to the possible starts for the overall pattern.
2687 //
2688 //------------------------------------------------------------------------------
matchStartType()2689 void   RegexCompile::matchStartType() {
2690     if (U_FAILURE(*fStatus)) {
2691         return;
2692     }
2693 
2694 
2695     int32_t    loc;                    // Location in the pattern of the current op being processed.
2696     int32_t    op;                     // The op being processed
2697     int32_t    opType;                 // The opcode type of the op
2698     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2699     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2700 
2701     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2702                                        //   could have advanced the position in a match.
2703                                        //   (Maximum match length so far == 0)
2704 
2705     // forwardedLength is a vector holding minimum-match-length values that
2706     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2707     //   It must be one longer than the pattern being checked because some  ops
2708     //   will jmp to a end-of-block+1 location from within a block, and we must
2709     //   count those when checking the block.
2710     int32_t end = fRXPat->fCompiledPat->size();
2711     UVector32  forwardedLength(end+1, *fStatus);
2712     forwardedLength.setSize(end+1);
2713     for (loc=3; loc<end; loc++) {
2714         forwardedLength.setElementAt(INT32_MAX, loc);
2715     }
2716 
2717     for (loc = 3; loc<end; loc++) {
2718         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2719         opType = URX_TYPE(op);
2720 
2721         // The loop is advancing linearly through the pattern.
2722         // If the op we are now at was the destination of a branch in the pattern,
2723         // and that path has a shorter minimum length than the current accumulated value,
2724         // replace the current accumulated value.
2725         if (forwardedLength.elementAti(loc) < currentLen) {
2726             currentLen = forwardedLength.elementAti(loc);
2727             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2728         }
2729 
2730         switch (opType) {
2731             // Ops that don't change the total length matched
2732         case URX_RESERVED_OP:
2733         case URX_END:
2734         case URX_FAIL:
2735         case URX_STRING_LEN:
2736         case URX_NOP:
2737         case URX_START_CAPTURE:
2738         case URX_END_CAPTURE:
2739         case URX_BACKSLASH_B:
2740         case URX_BACKSLASH_BU:
2741         case URX_BACKSLASH_G:
2742         case URX_BACKSLASH_Z:
2743         case URX_DOLLAR:
2744         case URX_DOLLAR_M:
2745         case URX_DOLLAR_D:
2746         case URX_DOLLAR_MD:
2747         case URX_RELOC_OPRND:
2748         case URX_STO_INP_LOC:
2749         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2750         case URX_BACKREF_I:
2751 
2752         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2753         case URX_LD_SP:
2754             break;
2755 
2756         case URX_CARET:
2757             if (atStart) {
2758                 fRXPat->fStartType = START_START;
2759             }
2760             break;
2761 
2762         case URX_CARET_M:
2763         case URX_CARET_M_UNIX:
2764             if (atStart) {
2765                 fRXPat->fStartType = START_LINE;
2766             }
2767             break;
2768 
2769         case URX_ONECHAR:
2770             if (currentLen == 0) {
2771                 // This character could appear at the start of a match.
2772                 //   Add it to the set of possible starting characters.
2773                 fRXPat->fInitialChars->add(URX_VAL(op));
2774                 numInitialStrings += 2;
2775             }
2776             currentLen = safeIncrement(currentLen, 1);
2777             atStart = FALSE;
2778             break;
2779 
2780 
2781         case URX_SETREF:
2782             if (currentLen == 0) {
2783                 int32_t  sn = URX_VAL(op);
2784                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2785                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2786                 fRXPat->fInitialChars->addAll(*s);
2787                 numInitialStrings += 2;
2788             }
2789             currentLen = safeIncrement(currentLen, 1);
2790             atStart = FALSE;
2791             break;
2792 
2793         case URX_LOOP_SR_I:
2794             // [Set]*, like a SETREF, above, in what it can match,
2795             //  but may not match at all, so currentLen is not incremented.
2796             if (currentLen == 0) {
2797                 int32_t  sn = URX_VAL(op);
2798                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2799                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2800                 fRXPat->fInitialChars->addAll(*s);
2801                 numInitialStrings += 2;
2802             }
2803             atStart = FALSE;
2804             break;
2805 
2806         case URX_LOOP_DOT_I:
2807             if (currentLen == 0) {
2808                 // .* at the start of a pattern.
2809                 //    Any character can begin the match.
2810                 fRXPat->fInitialChars->clear();
2811                 fRXPat->fInitialChars->complement();
2812                 numInitialStrings += 2;
2813             }
2814             atStart = FALSE;
2815             break;
2816 
2817 
2818         case URX_STATIC_SETREF:
2819             if (currentLen == 0) {
2820                 int32_t  sn = URX_VAL(op);
2821                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2822                 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2823                 fRXPat->fInitialChars->addAll(s);
2824                 numInitialStrings += 2;
2825             }
2826             currentLen = safeIncrement(currentLen, 1);
2827             atStart = FALSE;
2828             break;
2829 
2830 
2831 
2832         case URX_STAT_SETREF_N:
2833             if (currentLen == 0) {
2834                 int32_t  sn = URX_VAL(op);
2835                 UnicodeSet sc;
2836                 sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2837                 fRXPat->fInitialChars->addAll(sc);
2838                 numInitialStrings += 2;
2839             }
2840             currentLen = safeIncrement(currentLen, 1);
2841             atStart = FALSE;
2842             break;
2843 
2844 
2845 
2846         case URX_BACKSLASH_D:
2847             // Digit Char
2848              if (currentLen == 0) {
2849                  UnicodeSet s;
2850                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2851                  if (URX_VAL(op) != 0) {
2852                      s.complement();
2853                  }
2854                  fRXPat->fInitialChars->addAll(s);
2855                  numInitialStrings += 2;
2856             }
2857             currentLen = safeIncrement(currentLen, 1);
2858             atStart = FALSE;
2859             break;
2860 
2861 
2862         case URX_BACKSLASH_H:
2863             // Horiz white space
2864             if (currentLen == 0) {
2865                 UnicodeSet s;
2866                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2867                 s.add((UChar32)9);   // Tab
2868                 if (URX_VAL(op) != 0) {
2869                     s.complement();
2870                 }
2871                 fRXPat->fInitialChars->addAll(s);
2872                 numInitialStrings += 2;
2873             }
2874             currentLen = safeIncrement(currentLen, 1);
2875             atStart = FALSE;
2876             break;
2877 
2878 
2879         case URX_BACKSLASH_R:       // Any line ending sequence
2880         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2881             if (currentLen == 0) {
2882                 UnicodeSet s;
2883                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2884                 s.add((UChar32)0x85);
2885                 s.add((UChar32)0x2028, (UChar32)0x2029);
2886                 if (URX_VAL(op) != 0) {
2887                      // Complement option applies to URX_BACKSLASH_V only.
2888                      s.complement();
2889                 }
2890                 fRXPat->fInitialChars->addAll(s);
2891                 numInitialStrings += 2;
2892             }
2893             currentLen = safeIncrement(currentLen, 1);
2894             atStart = FALSE;
2895             break;
2896 
2897 
2898 
2899         case URX_ONECHAR_I:
2900             // Case Insensitive Single Character.
2901             if (currentLen == 0) {
2902                 UChar32  c = URX_VAL(op);
2903                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2904                     UnicodeSet starters(c, c);
2905                     starters.closeOver(USET_CASE_INSENSITIVE);
2906                     // findCaseInsensitiveStarters(c, &starters);
2907                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2908                     //   The expanded folding can't match the pattern.
2909                     fRXPat->fInitialChars->addAll(starters);
2910                 } else {
2911                     // Char has no case variants.  Just add it as-is to the
2912                     //   set of possible starting chars.
2913                     fRXPat->fInitialChars->add(c);
2914                 }
2915                 numInitialStrings += 2;
2916             }
2917             currentLen = safeIncrement(currentLen, 1);
2918             atStart = FALSE;
2919             break;
2920 
2921 
2922         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2923         case URX_DOTANY_ALL:    // . matches one or two.
2924         case URX_DOTANY:
2925         case URX_DOTANY_UNIX:
2926             if (currentLen == 0) {
2927                 // These constructs are all bad news when they appear at the start
2928                 //   of a match.  Any character can begin the match.
2929                 fRXPat->fInitialChars->clear();
2930                 fRXPat->fInitialChars->complement();
2931                 numInitialStrings += 2;
2932             }
2933             currentLen = safeIncrement(currentLen, 1);
2934             atStart = FALSE;
2935             break;
2936 
2937 
2938         case URX_JMPX:
2939             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2940             U_FALLTHROUGH;
2941         case URX_JMP:
2942             {
2943                 int32_t  jmpDest = URX_VAL(op);
2944                 if (jmpDest < loc) {
2945                     // Loop of some kind.  Can safely ignore, the worst that will happen
2946                     //  is that we understate the true minimum length
2947                     currentLen = forwardedLength.elementAti(loc+1);
2948 
2949                 } else {
2950                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2951                     U_ASSERT(jmpDest <= end+1);
2952                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2953                         forwardedLength.setElementAt(currentLen, jmpDest);
2954                     }
2955                 }
2956             }
2957             atStart = FALSE;
2958             break;
2959 
2960         case URX_JMP_SAV:
2961         case URX_JMP_SAV_X:
2962             // Combo of state save to the next loc, + jmp backwards.
2963             //   Net effect on min. length computation is nothing.
2964             atStart = FALSE;
2965             break;
2966 
2967         case URX_BACKTRACK:
2968             // Fails are kind of like a branch, except that the min length was
2969             //   propagated already, by the state save.
2970             currentLen = forwardedLength.elementAti(loc+1);
2971             atStart = FALSE;
2972             break;
2973 
2974 
2975         case URX_STATE_SAVE:
2976             {
2977                 // State Save, for forward jumps, propagate the current minimum.
2978                 //             of the state save.
2979                 int32_t  jmpDest = URX_VAL(op);
2980                 if (jmpDest > loc) {
2981                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2982                         forwardedLength.setElementAt(currentLen, jmpDest);
2983                     }
2984                 }
2985             }
2986             atStart = FALSE;
2987             break;
2988 
2989 
2990 
2991 
2992         case URX_STRING:
2993             {
2994                 loc++;
2995                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2996                 int32_t stringLen   = URX_VAL(stringLenOp);
2997                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2998                 U_ASSERT(stringLenOp >= 2);
2999                 if (currentLen == 0) {
3000                     // Add the starting character of this string to the set of possible starting
3001                     //   characters for this pattern.
3002                     int32_t stringStartIdx = URX_VAL(op);
3003                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3004                     fRXPat->fInitialChars->add(c);
3005 
3006                     // Remember this string.  After the entire pattern has been checked,
3007                     //  if nothing else is identified that can start a match, we'll use it.
3008                     numInitialStrings++;
3009                     fRXPat->fInitialStringIdx = stringStartIdx;
3010                     fRXPat->fInitialStringLen = stringLen;
3011                 }
3012 
3013                 currentLen = safeIncrement(currentLen, stringLen);
3014                 atStart = FALSE;
3015             }
3016             break;
3017 
3018         case URX_STRING_I:
3019             {
3020                 // Case-insensitive string.  Unlike exact-match strings, we won't
3021                 //   attempt a string search for possible match positions.  But we
3022                 //   do update the set of possible starting characters.
3023                 loc++;
3024                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3025                 int32_t stringLen   = URX_VAL(stringLenOp);
3026                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3027                 U_ASSERT(stringLenOp >= 2);
3028                 if (currentLen == 0) {
3029                     // Add the starting character of this string to the set of possible starting
3030                     //   characters for this pattern.
3031                     int32_t stringStartIdx = URX_VAL(op);
3032                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3033                     UnicodeSet s;
3034                     findCaseInsensitiveStarters(c, &s);
3035                     fRXPat->fInitialChars->addAll(s);
3036                     numInitialStrings += 2;  // Matching on an initial string not possible.
3037                 }
3038                 currentLen = safeIncrement(currentLen, stringLen);
3039                 atStart = FALSE;
3040             }
3041             break;
3042 
3043         case URX_CTR_INIT:
3044         case URX_CTR_INIT_NG:
3045             {
3046                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3047                 //   so location must be updated accordingly.
3048                 // Loop Init Ops.
3049                 //   If the min loop count == 0
3050                 //      move loc forwards to the end of the loop, skipping over the body.
3051                 //   If the min count is > 0,
3052                 //      continue normal processing of the body of the loop.
3053                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3054                         loopEndLoc   = URX_VAL(loopEndLoc);
3055                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3056                 if (minLoopCount == 0) {
3057                     // Min Loop Count of 0, treat like a forward branch and
3058                     //   move the current minimum length up to the target
3059                     //   (end of loop) location.
3060                     U_ASSERT(loopEndLoc <= end+1);
3061                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3062                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3063                     }
3064                 }
3065                 loc+=3;  // Skips over operands of CTR_INIT
3066             }
3067             atStart = FALSE;
3068             break;
3069 
3070 
3071         case URX_CTR_LOOP:
3072         case URX_CTR_LOOP_NG:
3073             // Loop ops.
3074             //  The jump is conditional, backwards only.
3075             atStart = FALSE;
3076             break;
3077 
3078         case URX_LOOP_C:
3079             // More loop ops.  These state-save to themselves.
3080             //   don't change the minimum match
3081             atStart = FALSE;
3082             break;
3083 
3084 
3085         case URX_LA_START:
3086         case URX_LB_START:
3087             {
3088                 // Look-around.  Scan forward until the matching look-ahead end,
3089                 //   without processing the look-around block.  This is overly pessimistic.
3090 
3091                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3092                 //   lookahead contains two LA_END instructions, so count goes up by two
3093                 //   for each LA_START.
3094                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3095                 for (;;) {
3096                     loc++;
3097                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3098                     if (URX_TYPE(op) == URX_LA_START) {
3099                         depth+=2;
3100                     }
3101                     if (URX_TYPE(op) == URX_LB_START) {
3102                         depth++;
3103                     }
3104                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3105                         depth--;
3106                         if (depth == 0) {
3107                             break;
3108                         }
3109                     }
3110                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3111                         // Need this because neg lookahead blocks will FAIL to outside
3112                         //   of the block.
3113                         int32_t  jmpDest = URX_VAL(op);
3114                         if (jmpDest > loc) {
3115                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3116                                 forwardedLength.setElementAt(currentLen, jmpDest);
3117                             }
3118                         }
3119                     }
3120                     U_ASSERT(loc <= end);
3121                 }
3122             }
3123             break;
3124 
3125         case URX_LA_END:
3126         case URX_LB_CONT:
3127         case URX_LB_END:
3128         case URX_LBN_CONT:
3129         case URX_LBN_END:
3130             UPRV_UNREACHABLE;     // Shouldn't get here.  These ops should be
3131                                  //  consumed by the scan in URX_LA_START and LB_START
3132         default:
3133             UPRV_UNREACHABLE;
3134             }
3135 
3136         }
3137 
3138 
3139     // We have finished walking through the ops.  Check whether some forward jump
3140     //   propagated a shorter length to location end+1.
3141     if (forwardedLength.elementAti(end+1) < currentLen) {
3142         currentLen = forwardedLength.elementAti(end+1);
3143     }
3144 
3145 
3146     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3147 
3148 
3149     // Sort out what we should check for when looking for candidate match start positions.
3150     // In order of preference,
3151     //     1.   Start of input text buffer.
3152     //     2.   A literal string.
3153     //     3.   Start of line in multi-line mode.
3154     //     4.   A single literal character.
3155     //     5.   A character from a set of characters.
3156     //
3157     if (fRXPat->fStartType == START_START) {
3158         // Match only at the start of an input text string.
3159         //    start type is already set.  We're done.
3160     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3161         // Match beginning only with a literal string.
3162         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3163         U_ASSERT(fRXPat->fInitialChars->contains(c));
3164         fRXPat->fStartType   = START_STRING;
3165         fRXPat->fInitialChar = c;
3166     } else if (fRXPat->fStartType == START_LINE) {
3167         // Match at start of line in Multi-Line mode.
3168         // Nothing to do here; everything is already set.
3169     } else if (fRXPat->fMinMatchLen == 0) {
3170         // Zero length match possible.  We could start anywhere.
3171         fRXPat->fStartType = START_NO_INFO;
3172     } else if (fRXPat->fInitialChars->size() == 1) {
3173         // All matches begin with the same char.
3174         fRXPat->fStartType   = START_CHAR;
3175         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3176         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3177     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3178         fRXPat->fMinMatchLen > 0) {
3179         // Matches start with a set of character smaller than the set of all chars.
3180         fRXPat->fStartType = START_SET;
3181     } else {
3182         // Matches can start with anything
3183         fRXPat->fStartType = START_NO_INFO;
3184     }
3185 
3186     return;
3187 }
3188 
3189 
3190 
3191 //------------------------------------------------------------------------------
3192 //
3193 //   minMatchLength    Calculate the length of the shortest string that could
3194 //                     match the specified pattern.
3195 //                     Length is in 16 bit code units, not code points.
3196 //
3197 //                     The calculated length may not be exact.  The returned
3198 //                     value may be shorter than the actual minimum; it must
3199 //                     never be longer.
3200 //
3201 //                     start and end are the range of p-code operations to be
3202 //                     examined.  The endpoints are included in the range.
3203 //
3204 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3205 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3206     if (U_FAILURE(*fStatus)) {
3207         return 0;
3208     }
3209 
3210     U_ASSERT(start <= end);
3211     U_ASSERT(end < fRXPat->fCompiledPat->size());
3212 
3213 
3214     int32_t    loc;
3215     int32_t    op;
3216     int32_t    opType;
3217     int32_t    currentLen = 0;
3218 
3219 
3220     // forwardedLength is a vector holding minimum-match-length values that
3221     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3222     //   It must be one longer than the pattern being checked because some  ops
3223     //   will jmp to a end-of-block+1 location from within a block, and we must
3224     //   count those when checking the block.
3225     UVector32  forwardedLength(end+2, *fStatus);
3226     forwardedLength.setSize(end+2);
3227     for (loc=start; loc<=end+1; loc++) {
3228         forwardedLength.setElementAt(INT32_MAX, loc);
3229     }
3230 
3231     for (loc = start; loc<=end; loc++) {
3232         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3233         opType = URX_TYPE(op);
3234 
3235         // The loop is advancing linearly through the pattern.
3236         // If the op we are now at was the destination of a branch in the pattern,
3237         // and that path has a shorter minimum length than the current accumulated value,
3238         // replace the current accumulated value.
3239         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3240                                                                //   no-match-possible cases.
3241         if (forwardedLength.elementAti(loc) < currentLen) {
3242             currentLen = forwardedLength.elementAti(loc);
3243             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3244         }
3245 
3246         switch (opType) {
3247             // Ops that don't change the total length matched
3248         case URX_RESERVED_OP:
3249         case URX_END:
3250         case URX_STRING_LEN:
3251         case URX_NOP:
3252         case URX_START_CAPTURE:
3253         case URX_END_CAPTURE:
3254         case URX_BACKSLASH_B:
3255         case URX_BACKSLASH_BU:
3256         case URX_BACKSLASH_G:
3257         case URX_BACKSLASH_Z:
3258         case URX_CARET:
3259         case URX_DOLLAR:
3260         case URX_DOLLAR_M:
3261         case URX_DOLLAR_D:
3262         case URX_DOLLAR_MD:
3263         case URX_RELOC_OPRND:
3264         case URX_STO_INP_LOC:
3265         case URX_CARET_M:
3266         case URX_CARET_M_UNIX:
3267         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3268         case URX_BACKREF_I:
3269 
3270         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3271         case URX_LD_SP:
3272 
3273         case URX_JMP_SAV:
3274         case URX_JMP_SAV_X:
3275             break;
3276 
3277 
3278             // Ops that match a minimum of one character (one or two 16 bit code units.)
3279             //
3280         case URX_ONECHAR:
3281         case URX_STATIC_SETREF:
3282         case URX_STAT_SETREF_N:
3283         case URX_SETREF:
3284         case URX_BACKSLASH_D:
3285         case URX_BACKSLASH_H:
3286         case URX_BACKSLASH_R:
3287         case URX_BACKSLASH_V:
3288         case URX_ONECHAR_I:
3289         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3290         case URX_DOTANY_ALL:    // . matches one or two.
3291         case URX_DOTANY:
3292         case URX_DOTANY_UNIX:
3293             currentLen = safeIncrement(currentLen, 1);
3294             break;
3295 
3296 
3297         case URX_JMPX:
3298             loc++;              // URX_JMPX has an extra operand, ignored here,
3299                                 //   otherwise processed identically to URX_JMP.
3300             U_FALLTHROUGH;
3301         case URX_JMP:
3302             {
3303                 int32_t  jmpDest = URX_VAL(op);
3304                 if (jmpDest < loc) {
3305                     // Loop of some kind.  Can safely ignore, the worst that will happen
3306                     //  is that we understate the true minimum length
3307                     currentLen = forwardedLength.elementAti(loc+1);
3308                 } else {
3309                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3310                     U_ASSERT(jmpDest <= end+1);
3311                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3312                         forwardedLength.setElementAt(currentLen, jmpDest);
3313                     }
3314                 }
3315             }
3316             break;
3317 
3318         case URX_BACKTRACK:
3319             {
3320                 // Back-tracks are kind of like a branch, except that the min length was
3321                 //   propagated already, by the state save.
3322                 currentLen = forwardedLength.elementAti(loc+1);
3323             }
3324             break;
3325 
3326 
3327         case URX_STATE_SAVE:
3328             {
3329                 // State Save, for forward jumps, propagate the current minimum.
3330                 //             of the state save.
3331                 int32_t  jmpDest = URX_VAL(op);
3332                 if (jmpDest > loc) {
3333                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3334                         forwardedLength.setElementAt(currentLen, jmpDest);
3335                     }
3336                 }
3337             }
3338             break;
3339 
3340 
3341         case URX_STRING:
3342             {
3343                 loc++;
3344                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3345                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3346             }
3347             break;
3348 
3349 
3350         case URX_STRING_I:
3351             {
3352                 loc++;
3353                 // TODO: with full case folding, matching input text may be shorter than
3354                 //       the string we have here.  More smarts could put some bounds on it.
3355                 //       Assume a min length of one for now.  A min length of zero causes
3356                 //        optimization failures for a pattern like "string"+
3357                 // currentLen += URX_VAL(stringLenOp);
3358                 currentLen = safeIncrement(currentLen, 1);
3359             }
3360             break;
3361 
3362         case URX_CTR_INIT:
3363         case URX_CTR_INIT_NG:
3364             {
3365                 // Loop Init Ops.
3366                 //   If the min loop count == 0
3367                 //      move loc forwards to the end of the loop, skipping over the body.
3368                 //   If the min count is > 0,
3369                 //      continue normal processing of the body of the loop.
3370                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3371                         loopEndLoc   = URX_VAL(loopEndLoc);
3372                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3373                 if (minLoopCount == 0) {
3374                     loc = loopEndLoc;
3375                 } else {
3376                     loc+=3;  // Skips over operands of CTR_INIT
3377                 }
3378             }
3379             break;
3380 
3381 
3382         case URX_CTR_LOOP:
3383         case URX_CTR_LOOP_NG:
3384             // Loop ops.
3385             //  The jump is conditional, backwards only.
3386             break;
3387 
3388         case URX_LOOP_SR_I:
3389         case URX_LOOP_DOT_I:
3390         case URX_LOOP_C:
3391             // More loop ops.  These state-save to themselves.
3392             //   don't change the minimum match - could match nothing at all.
3393             break;
3394 
3395 
3396         case URX_LA_START:
3397         case URX_LB_START:
3398             {
3399                 // Look-around.  Scan forward until the matching look-ahead end,
3400                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3401                 //   it assumes that the look-ahead match might be zero-length.
3402                 //   TODO:  Positive lookahead could recursively do the block, then continue
3403                 //          with the longer of the block or the value coming in.  Ticket 6060
3404                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3405                 for (;;) {
3406                     loc++;
3407                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3408                     if (URX_TYPE(op) == URX_LA_START) {
3409                         // The boilerplate for look-ahead includes two LA_END insturctions,
3410                         //    Depth will be decremented by each one when it is seen.
3411                         depth += 2;
3412                     }
3413                     if (URX_TYPE(op) == URX_LB_START) {
3414                         depth++;
3415                     }
3416                     if (URX_TYPE(op) == URX_LA_END) {
3417                         depth--;
3418                         if (depth == 0) {
3419                             break;
3420                         }
3421                     }
3422                     if (URX_TYPE(op)==URX_LBN_END) {
3423                         depth--;
3424                         if (depth == 0) {
3425                             break;
3426                         }
3427                     }
3428                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3429                         // Need this because neg lookahead blocks will FAIL to outside
3430                         //   of the block.
3431                         int32_t  jmpDest = URX_VAL(op);
3432                         if (jmpDest > loc) {
3433                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3434                                 forwardedLength.setElementAt(currentLen, jmpDest);
3435                             }
3436                         }
3437                     }
3438                     U_ASSERT(loc <= end);
3439                 }
3440             }
3441             break;
3442 
3443         case URX_LA_END:
3444         case URX_LB_CONT:
3445         case URX_LB_END:
3446         case URX_LBN_CONT:
3447         case URX_LBN_END:
3448             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3449             //   range being sized, which happens when measuring size of look-behind blocks.
3450             break;
3451 
3452         default:
3453             UPRV_UNREACHABLE;
3454             }
3455 
3456         }
3457 
3458     // We have finished walking through the ops.  Check whether some forward jump
3459     //   propagated a shorter length to location end+1.
3460     if (forwardedLength.elementAti(end+1) < currentLen) {
3461         currentLen = forwardedLength.elementAti(end+1);
3462         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3463     }
3464 
3465     return currentLen;
3466 }
3467 
3468 //------------------------------------------------------------------------------
3469 //
3470 //   maxMatchLength    Calculate the length of the longest string that could
3471 //                     match the specified pattern.
3472 //                     Length is in 16 bit code units, not code points.
3473 //
3474 //                     The calculated length may not be exact.  The returned
3475 //                     value may be longer than the actual maximum; it must
3476 //                     never be shorter.
3477 //
3478 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3479 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3480     if (U_FAILURE(*fStatus)) {
3481         return 0;
3482     }
3483     U_ASSERT(start <= end);
3484     U_ASSERT(end < fRXPat->fCompiledPat->size());
3485 
3486     int32_t    loc;
3487     int32_t    op;
3488     int32_t    opType;
3489     int32_t    currentLen = 0;
3490     UVector32  forwardedLength(end+1, *fStatus);
3491     forwardedLength.setSize(end+1);
3492 
3493     for (loc=start; loc<=end; loc++) {
3494         forwardedLength.setElementAt(0, loc);
3495     }
3496 
3497     for (loc = start; loc<=end; loc++) {
3498         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3499         opType = URX_TYPE(op);
3500 
3501         // The loop is advancing linearly through the pattern.
3502         // If the op we are now at was the destination of a branch in the pattern,
3503         // and that path has a longer maximum length than the current accumulated value,
3504         // replace the current accumulated value.
3505         if (forwardedLength.elementAti(loc) > currentLen) {
3506             currentLen = forwardedLength.elementAti(loc);
3507         }
3508 
3509         switch (opType) {
3510             // Ops that don't change the total length matched
3511         case URX_RESERVED_OP:
3512         case URX_END:
3513         case URX_STRING_LEN:
3514         case URX_NOP:
3515         case URX_START_CAPTURE:
3516         case URX_END_CAPTURE:
3517         case URX_BACKSLASH_B:
3518         case URX_BACKSLASH_BU:
3519         case URX_BACKSLASH_G:
3520         case URX_BACKSLASH_Z:
3521         case URX_CARET:
3522         case URX_DOLLAR:
3523         case URX_DOLLAR_M:
3524         case URX_DOLLAR_D:
3525         case URX_DOLLAR_MD:
3526         case URX_RELOC_OPRND:
3527         case URX_STO_INP_LOC:
3528         case URX_CARET_M:
3529         case URX_CARET_M_UNIX:
3530 
3531         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3532         case URX_LD_SP:
3533 
3534         case URX_LB_END:
3535         case URX_LB_CONT:
3536         case URX_LBN_CONT:
3537         case URX_LBN_END:
3538             break;
3539 
3540 
3541             // Ops that increase that cause an unbounded increase in the length
3542             //   of a matched string, or that increase it a hard to characterize way.
3543             //   Call the max length unbounded, and stop further checking.
3544         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3545         case URX_BACKREF_I:
3546         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3547             currentLen = INT32_MAX;
3548             break;
3549 
3550 
3551             // Ops that match a max of one character (possibly two 16 bit code units.)
3552             //
3553         case URX_STATIC_SETREF:
3554         case URX_STAT_SETREF_N:
3555         case URX_SETREF:
3556         case URX_BACKSLASH_D:
3557         case URX_BACKSLASH_H:
3558         case URX_BACKSLASH_R:
3559         case URX_BACKSLASH_V:
3560         case URX_ONECHAR_I:
3561         case URX_DOTANY_ALL:
3562         case URX_DOTANY:
3563         case URX_DOTANY_UNIX:
3564             currentLen = safeIncrement(currentLen, 2);
3565             break;
3566 
3567             // Single literal character.  Increase current max length by one or two,
3568             //       depending on whether the char is in the supplementary range.
3569         case URX_ONECHAR:
3570             currentLen = safeIncrement(currentLen, 1);
3571             if (URX_VAL(op) > 0x10000) {
3572                 currentLen = safeIncrement(currentLen, 1);
3573             }
3574             break;
3575 
3576             // Jumps.
3577             //
3578         case URX_JMP:
3579         case URX_JMPX:
3580         case URX_JMP_SAV:
3581         case URX_JMP_SAV_X:
3582             {
3583                 int32_t  jmpDest = URX_VAL(op);
3584                 if (jmpDest < loc) {
3585                     // Loop of some kind.  Max match length is unbounded.
3586                     currentLen = INT32_MAX;
3587                 } else {
3588                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3589                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3590                         forwardedLength.setElementAt(currentLen, jmpDest);
3591                     }
3592                     currentLen = 0;
3593                 }
3594             }
3595             break;
3596 
3597         case URX_BACKTRACK:
3598             // back-tracks are kind of like a branch, except that the max length was
3599             //   propagated already, by the state save.
3600             currentLen = forwardedLength.elementAti(loc+1);
3601             break;
3602 
3603 
3604         case URX_STATE_SAVE:
3605             {
3606                 // State Save, for forward jumps, propagate the current minimum.
3607                 //               of the state save.
3608                 //             For backwards jumps, they create a loop, maximum
3609                 //               match length is unbounded.
3610                 int32_t  jmpDest = URX_VAL(op);
3611                 if (jmpDest > loc) {
3612                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3613                         forwardedLength.setElementAt(currentLen, jmpDest);
3614                     }
3615                 } else {
3616                     currentLen = INT32_MAX;
3617                 }
3618             }
3619             break;
3620 
3621 
3622 
3623 
3624         case URX_STRING:
3625             {
3626                 loc++;
3627                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3628                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3629                 break;
3630             }
3631 
3632         case URX_STRING_I:
3633             // TODO:  This code assumes that any user string that matches will be no longer
3634             //        than our compiled string, with case insensitive matching.
3635             //        Our compiled string has been case-folded already.
3636             //
3637             //        Any matching user string will have no more code points than our
3638             //        compiled (folded) string.  Folding may add code points, but
3639             //        not remove them.
3640             //
3641             //        There is a potential problem if a supplemental code point
3642             //        case-folds to a BMP code point.  In this case our compiled string
3643             //        could be shorter (in code units) than a matching user string.
3644             //
3645             //        At this time (Unicode 6.1) there are no such characters, and this case
3646             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3647             //        any problematic characters are added to Unicode.
3648             //
3649             //        If this happens, we can make a set of the BMP chars that the
3650             //        troublesome supplementals fold to, scan our string, and bump the
3651             //        currentLen one extra for each that is found.
3652             //
3653             {
3654                 loc++;
3655                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3656                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3657             }
3658             break;
3659 
3660         case URX_CTR_INIT:
3661         case URX_CTR_INIT_NG:
3662             // For Loops, recursively call this function on the pattern for the loop body,
3663             //   then multiply the result by the maximum loop count.
3664             {
3665                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3666                 if (loopEndLoc == loc+4) {
3667                     // Loop has an empty body. No affect on max match length.
3668                     // Continue processing with code after the loop end.
3669                     loc = loopEndLoc;
3670                     break;
3671                 }
3672 
3673                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3674                 if (maxLoopCount == -1) {
3675                     // Unbounded Loop. No upper bound on match length.
3676                     currentLen = INT32_MAX;
3677                     break;
3678                 }
3679 
3680                 U_ASSERT(loopEndLoc >= loc+4);
3681                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3682                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3683                 if (updatedLen >= INT32_MAX) {
3684                     currentLen = INT32_MAX;
3685                     break;
3686                 }
3687                 currentLen = (int32_t)updatedLen;
3688                 loc = loopEndLoc;
3689                 break;
3690             }
3691 
3692         case URX_CTR_LOOP:
3693         case URX_CTR_LOOP_NG:
3694             // These opcodes will be skipped over by code for URX_CTR_INIT.
3695             // We shouldn't encounter them here.
3696             UPRV_UNREACHABLE;
3697 
3698         case URX_LOOP_SR_I:
3699         case URX_LOOP_DOT_I:
3700         case URX_LOOP_C:
3701             // For anything to do with loops, make the match length unbounded.
3702             currentLen = INT32_MAX;
3703             break;
3704 
3705 
3706 
3707         case URX_LA_START:
3708         case URX_LA_END:
3709             // Look-ahead.  Just ignore, treat the look-ahead block as if
3710             // it were normal pattern.  Gives a too-long match length,
3711             //  but good enough for now.
3712             break;
3713 
3714             // End of look-ahead ops should always be consumed by the processing at
3715             //  the URX_LA_START op.
3716             // UPRV_UNREACHABLE;
3717 
3718         case URX_LB_START:
3719             {
3720                 // Look-behind.  Scan forward until the matching look-around end,
3721                 //   without processing the look-behind block.
3722                 int32_t dataLoc = URX_VAL(op);
3723                 for (loc = loc + 1; loc < end; ++loc) {
3724                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3725                     int32_t opType = URX_TYPE(op);
3726                     if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3727                         break;
3728                     }
3729                 }
3730                 U_ASSERT(loc < end);
3731             }
3732             break;
3733 
3734         default:
3735             UPRV_UNREACHABLE;
3736         }
3737 
3738 
3739         if (currentLen == INT32_MAX) {
3740             //  The maximum length is unbounded.
3741             //  Stop further processing of the pattern.
3742             break;
3743         }
3744 
3745     }
3746     return currentLen;
3747 
3748 }
3749 
3750 
3751 //------------------------------------------------------------------------------
3752 //
3753 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3754 //                Extra NOPs are inserted for some constructs during the initial
3755 //                code generation to provide locations that may be patched later.
3756 //                Many end up unneeded, and are removed by this function.
3757 //
3758 //                In order to minimize the number of passes through the pattern,
3759 //                back-reference fixup is also performed here (adjusting
3760 //                back-reference operands to point to the correct frame offsets).
3761 //
3762 //------------------------------------------------------------------------------
stripNOPs()3763 void RegexCompile::stripNOPs() {
3764 
3765     if (U_FAILURE(*fStatus)) {
3766         return;
3767     }
3768 
3769     int32_t    end = fRXPat->fCompiledPat->size();
3770     UVector32  deltas(end, *fStatus);
3771 
3772     // Make a first pass over the code, computing the amount that things
3773     //   will be offset at each location in the original code.
3774     int32_t   loc;
3775     int32_t   d = 0;
3776     for (loc=0; loc<end; loc++) {
3777         deltas.addElement(d, *fStatus);
3778         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3779         if (URX_TYPE(op) == URX_NOP) {
3780             d++;
3781         }
3782     }
3783 
3784     UnicodeString caseStringBuffer;
3785 
3786     // Make a second pass over the code, removing the NOPs by moving following
3787     //  code up, and patching operands that refer to code locations that
3788     //  are being moved.  The array of offsets from the first step is used
3789     //  to compute the new operand values.
3790     int32_t src;
3791     int32_t dst = 0;
3792     for (src=0; src<end; src++) {
3793         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3794         int32_t opType = URX_TYPE(op);
3795         switch (opType) {
3796         case URX_NOP:
3797             break;
3798 
3799         case URX_STATE_SAVE:
3800         case URX_JMP:
3801         case URX_CTR_LOOP:
3802         case URX_CTR_LOOP_NG:
3803         case URX_RELOC_OPRND:
3804         case URX_JMPX:
3805         case URX_JMP_SAV:
3806         case URX_JMP_SAV_X:
3807             // These are instructions with operands that refer to code locations.
3808             {
3809                 int32_t  operandAddress = URX_VAL(op);
3810                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3811                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3812                 op = buildOp(opType, fixedOperandAddress);
3813                 fRXPat->fCompiledPat->setElementAt(op, dst);
3814                 dst++;
3815                 break;
3816             }
3817 
3818         case URX_BACKREF:
3819         case URX_BACKREF_I:
3820             {
3821                 int32_t where = URX_VAL(op);
3822                 if (where > fRXPat->fGroupMap->size()) {
3823                     error(U_REGEX_INVALID_BACK_REF);
3824                     break;
3825                 }
3826                 where = fRXPat->fGroupMap->elementAti(where-1);
3827                 op    = buildOp(opType, where);
3828                 fRXPat->fCompiledPat->setElementAt(op, dst);
3829                 dst++;
3830 
3831                 fRXPat->fNeedsAltInput = TRUE;
3832                 break;
3833             }
3834         case URX_RESERVED_OP:
3835         case URX_RESERVED_OP_N:
3836         case URX_BACKTRACK:
3837         case URX_END:
3838         case URX_ONECHAR:
3839         case URX_STRING:
3840         case URX_STRING_LEN:
3841         case URX_START_CAPTURE:
3842         case URX_END_CAPTURE:
3843         case URX_STATIC_SETREF:
3844         case URX_STAT_SETREF_N:
3845         case URX_SETREF:
3846         case URX_DOTANY:
3847         case URX_FAIL:
3848         case URX_BACKSLASH_B:
3849         case URX_BACKSLASH_BU:
3850         case URX_BACKSLASH_G:
3851         case URX_BACKSLASH_X:
3852         case URX_BACKSLASH_Z:
3853         case URX_DOTANY_ALL:
3854         case URX_BACKSLASH_D:
3855         case URX_CARET:
3856         case URX_DOLLAR:
3857         case URX_CTR_INIT:
3858         case URX_CTR_INIT_NG:
3859         case URX_DOTANY_UNIX:
3860         case URX_STO_SP:
3861         case URX_LD_SP:
3862         case URX_STO_INP_LOC:
3863         case URX_LA_START:
3864         case URX_LA_END:
3865         case URX_ONECHAR_I:
3866         case URX_STRING_I:
3867         case URX_DOLLAR_M:
3868         case URX_CARET_M:
3869         case URX_CARET_M_UNIX:
3870         case URX_LB_START:
3871         case URX_LB_CONT:
3872         case URX_LB_END:
3873         case URX_LBN_CONT:
3874         case URX_LBN_END:
3875         case URX_LOOP_SR_I:
3876         case URX_LOOP_DOT_I:
3877         case URX_LOOP_C:
3878         case URX_DOLLAR_D:
3879         case URX_DOLLAR_MD:
3880         case URX_BACKSLASH_H:
3881         case URX_BACKSLASH_R:
3882         case URX_BACKSLASH_V:
3883             // These instructions are unaltered by the relocation.
3884             fRXPat->fCompiledPat->setElementAt(op, dst);
3885             dst++;
3886             break;
3887 
3888         default:
3889             // Some op is unaccounted for.
3890             UPRV_UNREACHABLE;
3891         }
3892     }
3893 
3894     fRXPat->fCompiledPat->setSize(dst);
3895 }
3896 
3897 
3898 
3899 
3900 //------------------------------------------------------------------------------
3901 //
3902 //  Error         Report a rule parse error.
3903 //                Only report it if no previous error has been recorded.
3904 //
3905 //------------------------------------------------------------------------------
error(UErrorCode e)3906 void RegexCompile::error(UErrorCode e) {
3907     if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3908         *fStatus = e;
3909         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3910         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3911         // int64_t. If the values of the latter are out of range for the former,
3912         // set them to the appropriate "field not supported" values.
3913         if (fLineNum > 0x7FFFFFFF) {
3914             fParseErr->line   = 0;
3915             fParseErr->offset = -1;
3916         } else if (fCharNum > 0x7FFFFFFF) {
3917             fParseErr->line   = (int32_t)fLineNum;
3918             fParseErr->offset = -1;
3919         } else {
3920             fParseErr->line   = (int32_t)fLineNum;
3921             fParseErr->offset = (int32_t)fCharNum;
3922         }
3923 
3924         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3925 
3926         // Fill in the context.
3927         //   Note: extractBetween() pins supplied indicies to the string bounds.
3928         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3929         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3930         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3931         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3932     }
3933 }
3934 
3935 
3936 //
3937 //  Assorted Unicode character constants.
3938 //     Numeric because there is no portable way to enter them as literals.
3939 //     (Think EBCDIC).
3940 //
3941 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3942 static const UChar      chLF        = 0x0a;      // Line Feed
3943 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3944 static const UChar      chDigit0    = 0x30;      // '0'
3945 static const UChar      chDigit7    = 0x37;      // '9'
3946 static const UChar      chColon     = 0x3A;      // ':'
3947 static const UChar      chE         = 0x45;      // 'E'
3948 static const UChar      chQ         = 0x51;      // 'Q'
3949 //static const UChar      chN         = 0x4E;      // 'N'
3950 static const UChar      chP         = 0x50;      // 'P'
3951 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3952 //static const UChar      chLBracket  = 0x5b;      // '['
3953 static const UChar      chRBracket  = 0x5d;      // ']'
3954 static const UChar      chUp        = 0x5e;      // '^'
3955 static const UChar      chLowerP    = 0x70;
3956 static const UChar      chLBrace    = 0x7b;      // '{'
3957 static const UChar      chRBrace    = 0x7d;      // '}'
3958 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3959 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3960 
3961 
3962 //------------------------------------------------------------------------------
3963 //
3964 //  nextCharLL    Low Level Next Char from the regex pattern.
3965 //                Get a char from the string, keep track of input position
3966 //                     for error reporting.
3967 //
3968 //------------------------------------------------------------------------------
nextCharLL()3969 UChar32  RegexCompile::nextCharLL() {
3970     UChar32       ch;
3971 
3972     if (fPeekChar != -1) {
3973         ch = fPeekChar;
3974         fPeekChar = -1;
3975         return ch;
3976     }
3977 
3978     // assume we're already in the right place
3979     ch = UTEXT_NEXT32(fRXPat->fPattern);
3980     if (ch == U_SENTINEL) {
3981         return ch;
3982     }
3983 
3984     if (ch == chCR ||
3985         ch == chNEL ||
3986         ch == chLS   ||
3987         (ch == chLF && fLastChar != chCR)) {
3988         // Character is starting a new line.  Bump up the line number, and
3989         //  reset the column to 0.
3990         fLineNum++;
3991         fCharNum=0;
3992     }
3993     else {
3994         // Character is not starting a new line.  Except in the case of a
3995         //   LF following a CR, increment the column position.
3996         if (ch != chLF) {
3997             fCharNum++;
3998         }
3999     }
4000     fLastChar = ch;
4001     return ch;
4002 }
4003 
4004 //------------------------------------------------------------------------------
4005 //
4006 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
4007 //                 character without actually getting it.
4008 //
4009 //------------------------------------------------------------------------------
peekCharLL()4010 UChar32  RegexCompile::peekCharLL() {
4011     if (fPeekChar == -1) {
4012         fPeekChar = nextCharLL();
4013     }
4014     return fPeekChar;
4015 }
4016 
4017 
4018 //------------------------------------------------------------------------------
4019 //
4020 //   nextChar     for pattern scanning.  At this level, we handle stripping
4021 //                out comments and processing some backslash character escapes.
4022 //                The rest of the pattern grammar is handled at the next level up.
4023 //
4024 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4025 void RegexCompile::nextChar(RegexPatternChar &c) {
4026   tailRecursion:
4027     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4028     c.fChar    = nextCharLL();
4029     c.fQuoted  = FALSE;
4030 
4031     if (fQuoteMode) {
4032         c.fQuoted = TRUE;
4033         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4034             c.fChar == (UChar32)-1) {
4035             fQuoteMode = FALSE;  //  Exit quote mode,
4036             nextCharLL();        // discard the E
4037             // nextChar(c);      // recurse to get the real next char
4038             goto tailRecursion;  // Note: fuzz testing produced testcases that
4039                                  //       resulted in stack overflow here.
4040         }
4041     }
4042     else if (fInBackslashQuote) {
4043         // The current character immediately follows a '\'
4044         // Don't check for any further escapes, just return it as-is.
4045         // Don't set c.fQuoted, because that would prevent the state machine from
4046         //    dispatching on the character.
4047         fInBackslashQuote = FALSE;
4048     }
4049     else
4050     {
4051         // We are not in a \Q quoted region \E of the source.
4052         //
4053         if (fModeFlags & UREGEX_COMMENTS) {
4054             //
4055             // We are in free-spacing and comments mode.
4056             //  Scan through any white space and comments, until we
4057             //  reach a significant character or the end of inut.
4058             for (;;) {
4059                 if (c.fChar == (UChar32)-1) {
4060                     break;     // End of Input
4061                 }
4062                 if  (c.fChar == chPound && fEOLComments == TRUE) {
4063                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4064                     for (;;) {
4065                         c.fChar = nextCharLL();
4066                         if (c.fChar == (UChar32)-1 ||  // EOF
4067                             c.fChar == chCR        ||
4068                             c.fChar == chLF        ||
4069                             c.fChar == chNEL       ||
4070                             c.fChar == chLS)       {
4071                             break;
4072                         }
4073                     }
4074                 }
4075                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4076                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4077                     break;
4078                 }
4079                 c.fChar = nextCharLL();
4080             }
4081         }
4082 
4083         //
4084         //  check for backslash escaped characters.
4085         //
4086         if (c.fChar == chBackSlash) {
4087             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4088             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4089                 //
4090                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4091                 //   Includes \uxxxx, \n, \r, many others.
4092                 //   Return the single equivalent character.
4093                 //
4094                 nextCharLL();                 // get & discard the peeked char.
4095                 c.fQuoted = TRUE;
4096 
4097                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4098                     int32_t endIndex = (int32_t)pos;
4099                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4100 
4101                     if (endIndex == pos) {
4102                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4103                     }
4104                     fCharNum += endIndex - pos;
4105                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4106                 } else {
4107                     int32_t offset = 0;
4108                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4109 
4110                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4111                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4112 
4113                     if (offset == 0) {
4114                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4115                     } else if (context.lastOffset == offset) {
4116                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4117                     } else if (context.lastOffset != offset-1) {
4118                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4119                     }
4120                     fCharNum += offset;
4121                 }
4122             }
4123             else if (peekCharLL() == chDigit0) {
4124                 //  Octal Escape, using Java Regexp Conventions
4125                 //    which are \0 followed by 1-3 octal digits.
4126                 //    Different from ICU Unescape handling of Octal, which does not
4127                 //    require the leading 0.
4128                 //  Java also has the convention of only consuming 2 octal digits if
4129                 //    the three digit number would be > 0xff
4130                 //
4131                 c.fChar = 0;
4132                 nextCharLL();    // Consume the initial 0.
4133                 int index;
4134                 for (index=0; index<3; index++) {
4135                     int32_t ch = peekCharLL();
4136                     if (ch<chDigit0 || ch>chDigit7) {
4137                         if (index==0) {
4138                            // \0 is not followed by any octal digits.
4139                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4140                         }
4141                         break;
4142                     }
4143                     c.fChar <<= 3;
4144                     c.fChar += ch&7;
4145                     if (c.fChar <= 255) {
4146                         nextCharLL();
4147                     } else {
4148                         // The last digit made the number too big.  Forget we saw it.
4149                         c.fChar >>= 3;
4150                     }
4151                 }
4152                 c.fQuoted = TRUE;
4153             }
4154             else if (peekCharLL() == chQ) {
4155                 //  "\Q"  enter quote mode, which will continue until "\E"
4156                 fQuoteMode = TRUE;
4157                 nextCharLL();        // discard the 'Q'.
4158                 // nextChar(c);      // recurse to get the real next char.
4159                 goto tailRecursion;  // Note: fuzz testing produced test cases that
4160                 //                            resulted in stack overflow here.
4161             }
4162             else
4163             {
4164                 // We are in a '\' escape that will be handled by the state table scanner.
4165                 // Just return the backslash, but remember that the following char is to
4166                 //  be taken literally.
4167                 fInBackslashQuote = TRUE;
4168             }
4169         }
4170     }
4171 
4172     // re-enable # to end-of-line comments, in case they were disabled.
4173     // They are disabled by the parser upon seeing '(?', but this lasts for
4174     //  the fetching of the next character only.
4175     fEOLComments = TRUE;
4176 
4177     // putc(c.fChar, stdout);
4178 }
4179 
4180 
4181 
4182 //------------------------------------------------------------------------------
4183 //
4184 //  scanNamedChar
4185 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4186 //
4187 //             The scan position will be at the 'N'.  On return
4188 //             the scan position should be just after the '}'
4189 //
4190 //             Return the UChar32
4191 //
4192 //------------------------------------------------------------------------------
scanNamedChar()4193 UChar32  RegexCompile::scanNamedChar() {
4194     if (U_FAILURE(*fStatus)) {
4195         return 0;
4196     }
4197 
4198     nextChar(fC);
4199     if (fC.fChar != chLBrace) {
4200         error(U_REGEX_PROPERTY_SYNTAX);
4201         return 0;
4202     }
4203 
4204     UnicodeString  charName;
4205     for (;;) {
4206         nextChar(fC);
4207         if (fC.fChar == chRBrace) {
4208             break;
4209         }
4210         if (fC.fChar == -1) {
4211             error(U_REGEX_PROPERTY_SYNTAX);
4212             return 0;
4213         }
4214         charName.append(fC.fChar);
4215     }
4216 
4217     char name[100];
4218     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4219          (uint32_t)charName.length()>=sizeof(name)) {
4220         // All Unicode character names have only invariant characters.
4221         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4222         //   which requires this error check
4223         error(U_REGEX_PROPERTY_SYNTAX);
4224         return 0;
4225     }
4226     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4227 
4228     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4229     if (U_FAILURE(*fStatus)) {
4230         error(U_REGEX_PROPERTY_SYNTAX);
4231     }
4232 
4233     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4234     return theChar;
4235 }
4236 
4237 //------------------------------------------------------------------------------
4238 //
4239 //  scanProp   Construct a UnicodeSet from the text at the current scan
4240 //             position, which will be of the form \p{whaterver}
4241 //
4242 //             The scan position will be at the 'p' or 'P'.  On return
4243 //             the scan position should be just after the '}'
4244 //
4245 //             Return a UnicodeSet, constructed from the \P pattern,
4246 //             or NULL if the pattern is invalid.
4247 //
4248 //------------------------------------------------------------------------------
scanProp()4249 UnicodeSet *RegexCompile::scanProp() {
4250     UnicodeSet    *uset = NULL;
4251 
4252     if (U_FAILURE(*fStatus)) {
4253         return NULL;
4254     }
4255     (void)chLowerP;   // Suppress compiler unused variable warning.
4256     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4257     UBool negated = (fC.fChar == chP);
4258 
4259     UnicodeString propertyName;
4260     nextChar(fC);
4261     if (fC.fChar != chLBrace) {
4262         error(U_REGEX_PROPERTY_SYNTAX);
4263         return NULL;
4264     }
4265     for (;;) {
4266         nextChar(fC);
4267         if (fC.fChar == chRBrace) {
4268             break;
4269         }
4270         if (fC.fChar == -1) {
4271             // Hit the end of the input string without finding the closing '}'
4272             error(U_REGEX_PROPERTY_SYNTAX);
4273             return NULL;
4274         }
4275         propertyName.append(fC.fChar);
4276     }
4277     uset = createSetForProperty(propertyName, negated);
4278     nextChar(fC);    // Move input scan to position following the closing '}'
4279     return uset;
4280 }
4281 
4282 //------------------------------------------------------------------------------
4283 //
4284 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4285 //             position, which is expected be of the form [:property expression:]
4286 //
4287 //             The scan position will be at the opening ':'.  On return
4288 //             the scan position must be on the closing ']'
4289 //
4290 //             Return a UnicodeSet constructed from the pattern,
4291 //             or NULL if this is not a valid POSIX-style set expression.
4292 //             If not a property expression, restore the initial scan position
4293 //                (to the opening ':')
4294 //
4295 //               Note:  the opening '[:' is not sufficient to guarantee that
4296 //                      this is a [:property:] expression.
4297 //                      [:'+=,] is a perfectly good ordinary set expression that
4298 //                              happens to include ':' as one of its characters.
4299 //
4300 //------------------------------------------------------------------------------
scanPosixProp()4301 UnicodeSet *RegexCompile::scanPosixProp() {
4302     UnicodeSet    *uset = NULL;
4303 
4304     if (U_FAILURE(*fStatus)) {
4305         return NULL;
4306     }
4307 
4308     U_ASSERT(fC.fChar == chColon);
4309 
4310     // Save the scanner state.
4311     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4312     int64_t     savedScanIndex        = fScanIndex;
4313     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4314     UBool       savedQuoteMode        = fQuoteMode;
4315     UBool       savedInBackslashQuote = fInBackslashQuote;
4316     UBool       savedEOLComments      = fEOLComments;
4317     int64_t     savedLineNum          = fLineNum;
4318     int64_t     savedCharNum          = fCharNum;
4319     UChar32     savedLastChar         = fLastChar;
4320     UChar32     savedPeekChar         = fPeekChar;
4321     RegexPatternChar savedfC          = fC;
4322 
4323     // Scan for a closing ].   A little tricky because there are some perverse
4324     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4325     //   ending on the second closing ].
4326 
4327     UnicodeString propName;
4328     UBool         negated  = FALSE;
4329 
4330     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4331     nextChar(fC);
4332     if (fC.fChar == chUp) {
4333        negated = TRUE;
4334        nextChar(fC);
4335     }
4336 
4337     // Scan for the closing ":]", collecting the property name along the way.
4338     UBool  sawPropSetTerminator = FALSE;
4339     for (;;) {
4340         propName.append(fC.fChar);
4341         nextChar(fC);
4342         if (fC.fQuoted || fC.fChar == -1) {
4343             // Escaped characters or end of input - either says this isn't a [:Property:]
4344             break;
4345         }
4346         if (fC.fChar == chColon) {
4347             nextChar(fC);
4348             if (fC.fChar == chRBracket) {
4349                 sawPropSetTerminator = TRUE;
4350             }
4351             break;
4352         }
4353     }
4354 
4355     if (sawPropSetTerminator) {
4356         uset = createSetForProperty(propName, negated);
4357     }
4358     else
4359     {
4360         // No closing ":]".
4361         //  Restore the original scan position.
4362         //  The main scanner will retry the input as a normal set expression,
4363         //    not a [:Property:] expression.
4364         fScanIndex        = savedScanIndex;
4365         fQuoteMode        = savedQuoteMode;
4366         fInBackslashQuote = savedInBackslashQuote;
4367         fEOLComments      = savedEOLComments;
4368         fLineNum          = savedLineNum;
4369         fCharNum          = savedCharNum;
4370         fLastChar         = savedLastChar;
4371         fPeekChar         = savedPeekChar;
4372         fC                = savedfC;
4373         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4374     }
4375     return uset;
4376 }
4377 
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4378 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4379     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4380     addCategory(set, U_GC_CF_MASK, ec);
4381 }
4382 
4383 //
4384 //  Create a Unicode Set from a Unicode Property expression.
4385 //     This is common code underlying both \p{...} ane [:...:] expressions.
4386 //     Includes trying the Java "properties" that aren't supported as
4387 //     normal ICU UnicodeSet properties
4388 //
createSetForProperty(const UnicodeString & propName,UBool negated)4389 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4390 
4391     if (U_FAILURE(*fStatus)) {
4392         return nullptr;
4393     }
4394     LocalPointer<UnicodeSet> set;
4395     UErrorCode status = U_ZERO_ERROR;
4396 
4397     do {      // non-loop, exists to allow breaks from the block.
4398         //
4399         //  First try the property as we received it
4400         //
4401         UnicodeString   setExpr;
4402         uint32_t        usetFlags = 0;
4403         setExpr.append(u"[\\p{", -1);
4404         setExpr.append(propName);
4405         setExpr.append(u"}]", -1);
4406         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4407             usetFlags |= USET_CASE_INSENSITIVE;
4408         }
4409         set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
4410         if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4411             break;
4412         }
4413 
4414         //
4415         //  The incoming property wasn't directly recognized by ICU.
4416 
4417         //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4418         //     Java accepts 'word' with mixed case.
4419         //     Java accepts 'all' only in all lower case.
4420 
4421         status = U_ZERO_ERROR;
4422         if (propName.caseCompare(u"word", -1, 0) == 0) {
4423             set.adoptInsteadAndCheckErrorCode(
4424                 RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4425             break;
4426         }
4427         if (propName.compare(u"all", -1) == 0) {
4428             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4429             break;
4430         }
4431 
4432 
4433         //    Do Java InBlock expressions
4434         //
4435         UnicodeString mPropName = propName;
4436         if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4437             status = U_ZERO_ERROR;
4438             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4439             if (U_FAILURE(status)) {
4440                 break;
4441             }
4442             UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4443             set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4444             break;
4445         }
4446 
4447         //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4448         //  as "BooleanPropertyValue". The property value can be either a
4449         //  a General Category or a Script Name.
4450 
4451         if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4452             mPropName.remove(0, 2);      // Strip the "Is"
4453             if (mPropName.indexOf(u'=') >= 0) {
4454                 // Reject any "Is..." property expression containing an '=', that is,
4455                 // any non-binary property expression.
4456                 status = U_REGEX_PROPERTY_SYNTAX;
4457                 break;
4458             }
4459 
4460             if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4461                 mPropName.setTo(u"unassigned", -1);
4462                 negated = !negated;
4463             } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4464                 mPropName.setTo(u"Titlecase_Letter", -1);
4465             }
4466 
4467             mPropName.insert(0, u"[\\p{", -1);
4468             mPropName.append(u"}]", -1);
4469             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4470 
4471             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4472                 set->closeOver(USET_CASE_INSENSITIVE);
4473             }
4474             break;
4475 
4476         }
4477 
4478         if (propName.startsWith(u"java", -1)) {
4479             status = U_ZERO_ERROR;
4480             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4481             if (U_FAILURE(status)) {
4482                 break;
4483             }
4484             //
4485             //  Try the various Java specific properties.
4486             //   These all begin with "java"
4487             //
4488             if (propName.compare(u"javaDefined", -1) == 0) {
4489                 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4490                 set->complement();
4491             }
4492             else if (propName.compare(u"javaDigit", -1) == 0) {
4493                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4494             }
4495             else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4496                 addIdentifierIgnorable(set.getAlias(), status);
4497             }
4498             else if (propName.compare(u"javaISOControl", -1) == 0) {
4499                 set->add(0, 0x1F).add(0x7F, 0x9F);
4500             }
4501             else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4502                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4503                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4504                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4505                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4506                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4507                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4508                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4509                 addIdentifierIgnorable(set.getAlias(), status);
4510             }
4511             else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4512                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4513                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4514                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4515                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4516             }
4517             else if (propName.compare(u"javaLetter", -1) == 0) {
4518                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4519             }
4520             else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4521                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4522                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4523             }
4524             else if (propName.compare(u"javaLowerCase", -1) == 0) {
4525                 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4526             }
4527             else if (propName.compare(u"javaMirrored", -1) == 0) {
4528                 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4529             }
4530             else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4531                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4532             }
4533             else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4534                 set->add(0x10000, UnicodeSet::MAX_VALUE);
4535             }
4536             else if (propName.compare(u"javaTitleCase", -1) == 0) {
4537                 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4538             }
4539             else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4540                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4541                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4542             }
4543             else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4544                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4545                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4546                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4547                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4548                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4549                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4550                 addIdentifierIgnorable(set.getAlias(), status);
4551             }
4552             else if (propName.compare(u"javaUpperCase", -1) == 0) {
4553                 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4554             }
4555             else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4556                 set->add(0, UnicodeSet::MAX_VALUE);
4557             }
4558             else if (propName.compare(u"javaWhitespace", -1) == 0) {
4559                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4560                 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4561                 set->add(9, 0x0d).add(0x1c, 0x1f);
4562             } else {
4563                 status = U_REGEX_PROPERTY_SYNTAX;
4564             }
4565 
4566             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4567                 set->closeOver(USET_CASE_INSENSITIVE);
4568             }
4569             break;
4570         }
4571 
4572         // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4573         // extensions matched it.
4574         status = U_REGEX_PROPERTY_SYNTAX;
4575     } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4576 
4577     if (U_SUCCESS(status)) {
4578         U_ASSERT(set.isValid());
4579         if (negated) {
4580             set->complement();
4581         }
4582         return set.orphan();
4583     } else {
4584         if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4585             status = U_REGEX_PROPERTY_SYNTAX;
4586         }
4587         error(status);
4588         return nullptr;
4589     }
4590 }
4591 
4592 
4593 //
4594 //  SetEval   Part of the evaluation of [set expressions].
4595 //            Perform any pending (stacked) operations with precedence
4596 //            equal or greater to that of the next operator encountered
4597 //            in the expression.
4598 //
setEval(int32_t nextOp)4599 void RegexCompile::setEval(int32_t nextOp) {
4600     UnicodeSet *rightOperand = NULL;
4601     UnicodeSet *leftOperand  = NULL;
4602     for (;;) {
4603         U_ASSERT(fSetOpStack.empty()==FALSE);
4604         int32_t pendingSetOperation = fSetOpStack.peeki();
4605         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4606             break;
4607         }
4608         fSetOpStack.popi();
4609         U_ASSERT(fSetStack.empty() == FALSE);
4610         rightOperand = (UnicodeSet *)fSetStack.peek();
4611         switch (pendingSetOperation) {
4612             case setNegation:
4613                 rightOperand->complement();
4614                 break;
4615             case setCaseClose:
4616                 // TODO: need a simple close function.  Ticket 6065
4617                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4618                 rightOperand->removeAllStrings();
4619                 break;
4620             case setDifference1:
4621             case setDifference2:
4622                 fSetStack.pop();
4623                 leftOperand = (UnicodeSet *)fSetStack.peek();
4624                 leftOperand->removeAll(*rightOperand);
4625                 delete rightOperand;
4626                 break;
4627             case setIntersection1:
4628             case setIntersection2:
4629                 fSetStack.pop();
4630                 leftOperand = (UnicodeSet *)fSetStack.peek();
4631                 leftOperand->retainAll(*rightOperand);
4632                 delete rightOperand;
4633                 break;
4634             case setUnion:
4635                 fSetStack.pop();
4636                 leftOperand = (UnicodeSet *)fSetStack.peek();
4637                 leftOperand->addAll(*rightOperand);
4638                 delete rightOperand;
4639                 break;
4640             default:
4641                 UPRV_UNREACHABLE;
4642             }
4643         }
4644     }
4645 
setPushOp(int32_t op)4646 void RegexCompile::setPushOp(int32_t op) {
4647     setEval(op);
4648     fSetOpStack.push(op, *fStatus);
4649     fSetStack.push(new UnicodeSet(), *fStatus);
4650 }
4651 
4652 U_NAMESPACE_END
4653 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4654 
4655