• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  regexcmp.cpp
5 //
6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the ICU regular expression compiler, which is responsible
10 //  for processing a regular expression pattern into the compiled form that
11 //  is used by the match finding engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstr.h"
32 #include "cstring.h"
33 #include "uvectr32.h"
34 #include "uvectr64.h"
35 #include "uassert.h"
36 #include "uinvchar.h"
37 
38 #include "regeximp.h"
39 #include "regexcst.h"   // Contains state table for the regex pattern parser.
40                         //   generated by a Perl script.
41 #include "regexcmp.h"
42 #include "regexst.h"
43 #include "regextxt.h"
44 
45 
46 
47 U_NAMESPACE_BEGIN
48 
49 
50 //------------------------------------------------------------------------------
51 //
52 //  Constructor.
53 //
54 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56    fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status)
57 {
58     // Lazy init of all shared global sets (needed for init()'s empty text)
59     RegexStaticSets::initGlobals(&status);
60 
61     fStatus           = &status;
62 
63     fRXPat            = rxp;
64     fScanIndex        = 0;
65     fLastChar         = -1;
66     fPeekChar         = -1;
67     fLineNum          = 1;
68     fCharNum          = 0;
69     fQuoteMode        = FALSE;
70     fInBackslashQuote = FALSE;
71     fModeFlags        = fRXPat->fFlags | 0x80000000;
72     fEOLComments      = TRUE;
73 
74     fMatchOpenParen   = -1;
75     fMatchCloseParen  = -1;
76     fCaptureName      = NULL;
77     fLastSetLiteral   = U_SENTINEL;
78 
79     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80         status = rxp->fDeferredStatus;
81     }
82 }
83 
84 static const UChar      chAmp       = 0x26;      // '&'
85 static const UChar      chDash      = 0x2d;      // '-'
86 
87 
88 //------------------------------------------------------------------------------
89 //
90 //  Destructor
91 //
92 //------------------------------------------------------------------------------
~RegexCompile()93 RegexCompile::~RegexCompile() {
94     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
95                                  //   compilation stops with a syntax error.
96 }
97 
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100 }
101 
102 //------------------------------------------------------------------------------
103 //
104 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105 //                           The state tables are hand-written in the file regexcst.txt,
106 //                           and converted to the form used here by a perl
107 //                           script regexcst.pl
108 //
109 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)110 void    RegexCompile::compile(
111                          const UnicodeString &pat,   // Source pat to be compiled.
112                          UParseError &pp,            // Error position info
113                          UErrorCode &e)              // Error Code
114 {
115     fRXPat->fPatternString = new UnicodeString(pat);
116     UText patternText = UTEXT_INITIALIZER;
117     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118 
119     if (U_SUCCESS(e)) {
120         compile(&patternText, pp, e);
121         utext_close(&patternText);
122     }
123 }
124 
125 //
126 //   compile, UText mode
127 //     All the work is actually done here.
128 //
compile(UText * pat,UParseError & pp,UErrorCode & e)129 void    RegexCompile::compile(
130                          UText *pat,                 // Source pat to be compiled.
131                          UParseError &pp,            // Error position info
132                          UErrorCode &e)              // Error Code
133 {
134     fStatus             = &e;
135     fParseErr           = &pp;
136     fStackPtr           = 0;
137     fStack[fStackPtr]   = 0;
138 
139     if (U_FAILURE(*fStatus)) {
140         return;
141     }
142 
143     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
145 
146     // Prepare the RegexPattern object to receive the compiled pattern.
147     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148     if (U_FAILURE(*fStatus)) {
149         return;
150     }
151 
152     // Initialize the pattern scanning state machine
153     fPatternLength = utext_nativeLength(pat);
154     uint16_t                state = 1;
155     const RegexTableEl      *tableEl;
156 
157     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158     if (fModeFlags & UREGEX_LITERAL) {
159         fQuoteMode = TRUE;
160     }
161 
162     nextChar(fC);                        // Fetch the first char from the pattern string.
163 
164     //
165     // Main loop for the regex pattern parsing state machine.
166     //   Runs once per state transition.
167     //   Each time through optionally performs, depending on the state table,
168     //      - an advance to the the next pattern char
169     //      - an action to be performed.
170     //      - pushing or popping a state to/from the local state return stack.
171     //   file regexcst.txt is the source for the state table.  The logic behind
172     //     recongizing the pattern syntax is there, not here.
173     //
174     for (;;) {
175         //  Bail out if anything has gone wrong.
176         //  Regex pattern parsing stops on the first error encountered.
177         if (U_FAILURE(*fStatus)) {
178             break;
179         }
180 
181         U_ASSERT(state != 0);
182 
183         // Find the state table element that matches the input char from the pattern, or the
184         //    class of the input character.  Start with the first table row for this
185         //    state, then linearly scan forward until we find a row that matches the
186         //    character.  The last row for each state always matches all characters, so
187         //    the search will stop there, if not before.
188         //
189         tableEl = &gRuleParseStateTable[state];
190         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
191             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192 
193         for (;;) {    // loop through table rows belonging to this state, looking for one
194                       //   that matches the current input char.
195             REGEX_SCAN_DEBUG_PRINTF(("."));
196             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
197                 // Table row specified an individual character, not a set, and
198                 //   the input character is not quoted, and
199                 //   the input character matched it.
200                 break;
201             }
202             if (tableEl->fCharClass == 255) {
203                 // Table row specified default, match anything character class.
204                 break;
205             }
206             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
207                 // Table row specified "quoted" and the char was quoted.
208                 break;
209             }
210             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
211                 // Table row specified eof and we hit eof on the input.
212                 break;
213             }
214 
215             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
216                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
217                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
218                 U_ASSERT(tableEl->fCharClass <= 137);
219                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220                     // Table row specified a character class, or set of characters,
221                     //   and the current char matches it.
222                     break;
223                 }
224             }
225 
226             // No match on this row, advance to the next  row for this state,
227             tableEl++;
228         }
229         REGEX_SCAN_DEBUG_PRINTF(("\n"));
230 
231         //
232         // We've found the row of the state table that matches the current input
233         //   character from the rules string.
234         // Perform any action specified  by this row in the state table.
235         if (doParseActions(tableEl->fAction) == FALSE) {
236             // Break out of the state machine loop if the
237             //   the action signalled some kind of error, or
238             //   the action was to exit, occurs on normal end-of-rules-input.
239             break;
240         }
241 
242         if (tableEl->fPushState != 0) {
243             fStackPtr++;
244             if (fStackPtr >= kStackSize) {
245                 error(U_REGEX_INTERNAL_ERROR);
246                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247                 fStackPtr--;
248             }
249             fStack[fStackPtr] = tableEl->fPushState;
250         }
251 
252         //
253         //  NextChar.  This is where characters are actually fetched from the pattern.
254         //             Happens under control of the 'n' tag in the state table.
255         //
256         if (tableEl->fNextChar) {
257             nextChar(fC);
258         }
259 
260         // Get the next state from the table entry, or from the
261         //   state stack if the next state was specified as "pop".
262         if (tableEl->fNextState != 255) {
263             state = tableEl->fNextState;
264         } else {
265             state = fStack[fStackPtr];
266             fStackPtr--;
267             if (fStackPtr < 0) {
268                 // state stack underflow
269                 // This will occur if the user pattern has mis-matched parentheses,
270                 //   with extra close parens.
271                 //
272                 fStackPtr++;
273                 error(U_REGEX_MISMATCHED_PAREN);
274             }
275         }
276 
277     }
278 
279     if (U_FAILURE(*fStatus)) {
280         // Bail out if the pattern had errors.
281         return;
282     }
283 
284     //
285     // The pattern has now been read and processed, and the compiled code generated.
286     //
287 
288     //
289     // The pattern's fFrameSize so far has accumulated the requirements for
290     //   storage for capture parentheses, counters, etc. that are encountered
291     //   in the pattern.  Add space for the two variables that are always
292     //   present in the saved state:  the input string position (int64_t) and
293     //   the position in the compiled pattern.
294     //
295     allocateStackData(RESTACKFRAME_HDRCOUNT);
296 
297     //
298     // Optimization pass 1: NOPs, back-references, and case-folding
299     //
300     stripNOPs();
301 
302     //
303     // Get bounds for the minimum and maximum length of a string that this
304     //   pattern can match.  Used to avoid looking for matches in strings that
305     //   are too short.
306     //
307     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
308 
309     //
310     // Optimization pass 2: match start type
311     //
312     matchStartType();
313 
314     //
315     // Set up fast latin-1 range sets
316     //
317     int32_t numSets = fRXPat->fSets->size();
318     fRXPat->fSets8 = new Regex8BitSet[numSets];
319     // Null pointer check.
320     if (fRXPat->fSets8 == NULL) {
321         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
322         return;
323     }
324     int32_t i;
325     for (i=0; i<numSets; i++) {
326         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
327         fRXPat->fSets8[i].init(s);
328     }
329 
330 }
331 
332 
333 
334 
335 
336 //------------------------------------------------------------------------------
337 //
338 //  doParseAction        Do some action during regex pattern parsing.
339 //                       Called by the parse state machine.
340 //
341 //                       Generation of the match engine PCode happens here, or
342 //                       in functions called from the parse actions defined here.
343 //
344 //
345 //------------------------------------------------------------------------------
doParseActions(int32_t action)346 UBool RegexCompile::doParseActions(int32_t action)
347 {
348     UBool   returnVal = TRUE;
349 
350     switch ((Regex_PatternParseAction)action) {
351 
352     case doPatStart:
353         // Start of pattern compiles to:
354         //0   SAVE   2        Fall back to position of FAIL
355         //1   jmp    3
356         //2   FAIL            Stop if we ever reach here.
357         //3   NOP             Dummy, so start of pattern looks the same as
358         //                    the start of an ( grouping.
359         //4   NOP             Resreved, will be replaced by a save if there are
360         //                    OR | operators at the top level
361         appendOp(URX_STATE_SAVE, 2);
362         appendOp(URX_JMP,  3);
363         appendOp(URX_FAIL, 0);
364 
365         // Standard open nonCapture paren action emits the two NOPs and
366         //   sets up the paren stack frame.
367         doParseActions(doOpenNonCaptureParen);
368         break;
369 
370     case doPatFinish:
371         // We've scanned to the end of the pattern
372         //  The end of pattern compiles to:
373         //        URX_END
374         //    which will stop the runtime match engine.
375         //  Encountering end of pattern also behaves like a close paren,
376         //   and forces fixups of the State Save at the beginning of the compiled pattern
377         //   and of any OR operations at the top level.
378         //
379         handleCloseParen();
380         if (fParenStack.size() > 0) {
381             // Missing close paren in pattern.
382             error(U_REGEX_MISMATCHED_PAREN);
383         }
384 
385         // add the END operation to the compiled pattern.
386         appendOp(URX_END, 0);
387 
388         // Terminate the pattern compilation state machine.
389         returnVal = FALSE;
390         break;
391 
392 
393 
394     case doOrOperator:
395         // Scanning a '|', as in (A|B)
396         {
397             // Generate code for any pending literals preceding the '|'
398             fixLiterals(FALSE);
399 
400             // Insert a SAVE operation at the start of the pattern section preceding
401             //   this OR at this level.  This SAVE will branch the match forward
402             //   to the right hand side of the OR in the event that the left hand
403             //   side fails to match and backtracks.  Locate the position for the
404             //   save from the location on the top of the parentheses stack.
405             int32_t savePosition = fParenStack.popi();
406             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
407             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
408             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
409             fRXPat->fCompiledPat->setElementAt(op, savePosition);
410 
411             // Append an JMP operation into the compiled pattern.  The operand for
412             //  the JMP will eventually be the location following the ')' for the
413             //  group.  This will be patched in later, when the ')' is encountered.
414             appendOp(URX_JMP, 0);
415 
416             // Push the position of the newly added JMP op onto the parentheses stack.
417             // This registers if for fixup when this block's close paren is encountered.
418             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
419 
420             // Append a NOP to the compiled pattern.  This is the slot reserved
421             //   for a SAVE in the event that there is yet another '|' following
422             //   this one.
423             appendOp(URX_NOP, 0);
424             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
425         }
426         break;
427 
428 
429     case doBeginNamedCapture:
430         // Scanning (?<letter.
431         //   The first letter of the name will come through again under doConinueNamedCapture.
432         fCaptureName = new UnicodeString();
433         if (fCaptureName == NULL) {
434             error(U_MEMORY_ALLOCATION_ERROR);
435         }
436         break;
437 
438     case  doContinueNamedCapture:
439         fCaptureName->append(fC.fChar);
440         break;
441 
442     case doBadNamedCapture:
443         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
444         break;
445 
446     case doOpenCaptureParen:
447         // Open Capturing Paren, possibly named.
448         //   Compile to a
449         //      - NOP, which later may be replaced by a save-state if the
450         //         parenthesized group gets a * quantifier, followed by
451         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
452         //      - NOP, which may later be replaced by a save-state if there
453         //             is an '|' alternation within the parens.
454         //
455         //    Each capture group gets three slots in the save stack frame:
456         //         0: Capture Group start position (in input string being matched.)
457         //         1: Capture Group end position.
458         //         2: Start of Match-in-progress.
459         //    The first two locations are for a completed capture group, and are
460         //     referred to by back references and the like.
461         //    The third location stores the capture start position when an START_CAPTURE is
462         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
463         //      END_CAPTURE is encountered.
464         {
465             fixLiterals();
466             appendOp(URX_NOP, 0);
467             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
468             appendOp(URX_START_CAPTURE, varsLoc);
469             appendOp(URX_NOP, 0);
470 
471             // On the Parentheses stack, start a new frame and add the positions
472             //   of the two NOPs.  Depending on what follows in the pattern, the
473             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
474             //   address of the end of the parenthesized group.
475             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
476             fParenStack.push(capturing, *fStatus);                        // Frame type.
477             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
478             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
479 
480             // Save the mapping from group number to stack frame variable position.
481             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
482 
483             // If this is a named capture group, add the name->group number mapping.
484             if (fCaptureName != NULL) {
485                 if (!fRXPat->initNamedCaptureMap()) {
486                     if (U_SUCCESS(*fStatus)) {
487                         error(fRXPat->fDeferredStatus);
488                     }
489                     break;
490                 }
491                 int32_t groupNumber = fRXPat->fGroupMap->size();
492                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
493                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
494                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
495                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
496                 }
497             }
498         }
499         break;
500 
501     case doOpenNonCaptureParen:
502         // Open non-caputuring (grouping only) Paren.
503         //   Compile to a
504         //      - NOP, which later may be replaced by a save-state if the
505         //         parenthesized group gets a * quantifier, followed by
506         //      - NOP, which may later be replaced by a save-state if there
507         //             is an '|' alternation within the parens.
508         {
509             fixLiterals();
510             appendOp(URX_NOP, 0);
511             appendOp(URX_NOP, 0);
512 
513             // On the Parentheses stack, start a new frame and add the positions
514             //   of the two NOPs.
515             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
516             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
517             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
518             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
519         }
520          break;
521 
522 
523     case doOpenAtomicParen:
524         // Open Atomic Paren.  (?>
525         //   Compile to a
526         //      - NOP, which later may be replaced if the parenthesized group
527         //         has a quantifier, followed by
528         //      - STO_SP  save state stack position, so it can be restored at the ")"
529         //      - NOP, which may later be replaced by a save-state if there
530         //             is an '|' alternation within the parens.
531         {
532             fixLiterals();
533             appendOp(URX_NOP, 0);
534             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
535             appendOp(URX_STO_SP, varLoc);
536             appendOp(URX_NOP, 0);
537 
538             // On the Parentheses stack, start a new frame and add the positions
539             //   of the two NOPs.  Depending on what follows in the pattern, the
540             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
541             //   address of the end of the parenthesized group.
542             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
543             fParenStack.push(atomic, *fStatus);                           // Frame type.
544             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
545             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
546         }
547         break;
548 
549 
550     case doOpenLookAhead:
551         // Positive Look-ahead   (?=  stuff  )
552         //
553         //   Note:   Addition of transparent input regions, with the need to
554         //           restore the original regions when failing out of a lookahead
555         //           block, complicated this sequence.  Some combined opcodes
556         //           might make sense - or might not, lookahead aren't that common.
557         //
558         //      Caution:  min match length optimization knows about this
559         //               sequence; don't change without making updates there too.
560         //
561         // Compiles to
562         //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
563         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
564         //    3    JMP          6           continue ...
565         //
566         //    4.   LA_END                   Look Ahead failed.  Restore regions.
567         //    5.   BACKTRACK                and back track again.
568         //
569         //    6.   NOP              reserved for use by quantifiers on the block.
570         //                          Look-ahead can't have quantifiers, but paren stack
571         //                             compile time conventions require the slot anyhow.
572         //    7.   NOP              may be replaced if there is are '|' ops in the block.
573         //    8.     code for parenthesized stuff.
574         //    9.   LA_END
575         //
576         //  Four data slots are reserved, for saving state on entry to the look-around
577         //    0:   stack pointer on entry.
578         //    1:   input position on entry.
579         //    2:   fActiveStart, the active bounds start on entry.
580         //    3:   fActiveLimit, the active bounds limit on entry.
581         {
582             fixLiterals();
583             int32_t dataLoc = allocateData(4);
584             appendOp(URX_LA_START, dataLoc);
585             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
586             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
587             appendOp(URX_LA_END, dataLoc);
588             appendOp(URX_BACKTRACK, 0);
589             appendOp(URX_NOP, 0);
590             appendOp(URX_NOP, 0);
591 
592             // On the Parentheses stack, start a new frame and add the positions
593             //   of the NOPs.
594             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
595             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
596             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
597             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
598         }
599         break;
600 
601     case doOpenLookAheadNeg:
602         // Negated Lookahead.   (?! stuff )
603         // Compiles to
604         //    1.    LA_START    dataloc
605         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
606         //                                //   which continues with the match.
607         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
608         //    4.       code for parenthesized stuff.
609         //    5.    LA_END                // Cut back stack, remove saved state from step 2.
610         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
611         //    7.    END_LA                // Restore match region, in case look-ahead was using
612         //                                        an alternate (transparent) region.
613         //  Four data slots are reserved, for saving state on entry to the look-around
614         //    0:   stack pointer on entry.
615         //    1:   input position on entry.
616         //    2:   fActiveStart, the active bounds start on entry.
617         //    3:   fActiveLimit, the active bounds limit on entry.
618         {
619             fixLiterals();
620             int32_t dataLoc = allocateData(4);
621             appendOp(URX_LA_START, dataLoc);
622             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
623             appendOp(URX_NOP, 0);
624 
625             // On the Parentheses stack, start a new frame and add the positions
626             //   of the StateSave and NOP.
627             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
628             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
629             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
630             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
631 
632             // Instructions #5 - #7 will be added when the ')' is encountered.
633         }
634         break;
635 
636     case doOpenLookBehind:
637         {
638             //   Compile a (?<= look-behind open paren.
639             //
640             //          Compiles to
641             //              0       URX_LB_START     dataLoc
642             //              1       URX_LB_CONT      dataLoc
643             //              2                        MinMatchLen
644             //              3                        MaxMatchLen
645             //              4       URX_NOP          Standard '(' boilerplate.
646             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
647             //              6         <code for LookBehind expression>
648             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
649             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
650             //
651             //          Allocate a block of matcher data, to contain (when running a match)
652             //              0:    Stack ptr on entry
653             //              1:    Input Index on entry
654             //              2:    fActiveStart, the active bounds start on entry.
655             //              3:    fActiveLimit, the active bounds limit on entry.
656             //              4:    Start index of match current match attempt.
657             //          The first four items must match the layout of data for LA_START / LA_END
658 
659             // Generate match code for any pending literals.
660             fixLiterals();
661 
662             // Allocate data space
663             int32_t dataLoc = allocateData(5);
664 
665             // Emit URX_LB_START
666             appendOp(URX_LB_START, dataLoc);
667 
668             // Emit URX_LB_CONT
669             appendOp(URX_LB_CONT, dataLoc);
670             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
671             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
672 
673             // Emit the NOPs
674             appendOp(URX_NOP, 0);
675             appendOp(URX_NOP, 0);
676 
677             // On the Parentheses stack, start a new frame and add the positions
678             //   of the URX_LB_CONT and the NOP.
679             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
680             fParenStack.push(lookBehind, *fStatus);                       // Frame type
681             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
682             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
683 
684             // The final two instructions will be added when the ')' is encountered.
685         }
686 
687         break;
688 
689     case doOpenLookBehindNeg:
690         {
691             //   Compile a (?<! negated look-behind open paren.
692             //
693             //          Compiles to
694             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
695             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
696             //              2                        MinMatchLen
697             //              3                        MaxMatchLen
698             //              4                        continueLoc (9)
699             //              5       URX_NOP          Standard '(' boilerplate.
700             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
701             //              7         <code for LookBehind expression>
702             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
703             //              9       ...
704             //
705             //          Allocate a block of matcher data, to contain (when running a match)
706             //              0:    Stack ptr on entry
707             //              1:    Input Index on entry
708             //              2:    fActiveStart, the active bounds start on entry.
709             //              3:    fActiveLimit, the active bounds limit on entry.
710             //              4:    Start index of match current match attempt.
711             //          The first four items must match the layout of data for LA_START / LA_END
712 
713             // Generate match code for any pending literals.
714             fixLiterals();
715 
716             // Allocate data space
717             int32_t dataLoc = allocateData(5);
718 
719             // Emit URX_LB_START
720             appendOp(URX_LB_START, dataLoc);
721 
722             // Emit URX_LBN_CONT
723             appendOp(URX_LBN_CONT, dataLoc);
724             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
725             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
726             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
727 
728             // Emit the NOPs
729             appendOp(URX_NOP, 0);
730             appendOp(URX_NOP, 0);
731 
732             // On the Parentheses stack, start a new frame and add the positions
733             //   of the URX_LB_CONT and the NOP.
734             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
735             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
736             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
737             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
738 
739             // The final two instructions will be added when the ')' is encountered.
740         }
741         break;
742 
743     case doConditionalExpr:
744         // Conditionals such as (?(1)a:b)
745     case doPerlInline:
746         // Perl inline-conditionals.  (?{perl code}a|b) We're not perl, no way to do them.
747         error(U_REGEX_UNIMPLEMENTED);
748         break;
749 
750 
751     case doCloseParen:
752         handleCloseParen();
753         if (fParenStack.size() <= 0) {
754             //  Extra close paren, or missing open paren.
755             error(U_REGEX_MISMATCHED_PAREN);
756         }
757         break;
758 
759     case doNOP:
760         break;
761 
762 
763     case doBadOpenParenType:
764     case doRuleError:
765         error(U_REGEX_RULE_SYNTAX);
766         break;
767 
768 
769     case doMismatchedParenErr:
770         error(U_REGEX_MISMATCHED_PAREN);
771         break;
772 
773     case doPlus:
774         //  Normal '+'  compiles to
775         //     1.   stuff to be repeated  (already built)
776         //     2.   jmp-sav 1
777         //     3.   ...
778         //
779         //  Or, if the item to be repeated can match a zero length string,
780         //     1.   STO_INP_LOC  data-loc
781         //     2.      body of stuff to be repeated
782         //     3.   JMP_SAV_X    2
783         //     4.   ...
784 
785         //
786         //  Or, if the item to be repeated is simple
787         //     1.   Item to be repeated.
788         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
789         //     3.   LOOP_C       stack location
790         {
791             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
792             int32_t  frameLoc;
793 
794             // Check for simple constructs, which may get special optimized code.
795             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
796                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
797 
798                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
799                     // Emit optimized code for [char set]+
800                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
801                     frameLoc = allocateStackData(1);
802                     appendOp(URX_LOOP_C, frameLoc);
803                     break;
804                 }
805 
806                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
807                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
808                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
809                     // Emit Optimized code for .+ operations.
810                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
811                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
812                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
813                         loopOpI |= 1;
814                     }
815                     if (fModeFlags & UREGEX_UNIX_LINES) {
816                         loopOpI |= 2;
817                     }
818                     appendOp(loopOpI);
819                     frameLoc = allocateStackData(1);
820                     appendOp(URX_LOOP_C, frameLoc);
821                     break;
822                 }
823 
824             }
825 
826             // General case.
827 
828             // Check for minimum match length of zero, which requires
829             //    extra loop-breaking code.
830             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
831                 // Zero length match is possible.
832                 // Emit the code sequence that can handle it.
833                 insertOp(topLoc);
834                 frameLoc = allocateStackData(1);
835 
836                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
837                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
838 
839                 appendOp(URX_JMP_SAV_X, topLoc+1);
840             } else {
841                 // Simpler code when the repeated body must match something non-empty
842                 appendOp(URX_JMP_SAV, topLoc);
843             }
844         }
845         break;
846 
847     case doNGPlus:
848         //  Non-greedy '+?'  compiles to
849         //     1.   stuff to be repeated  (already built)
850         //     2.   state-save  1
851         //     3.   ...
852         {
853             int32_t topLoc      = blockTopLoc(FALSE);
854             appendOp(URX_STATE_SAVE, topLoc);
855         }
856         break;
857 
858 
859     case doOpt:
860         // Normal (greedy) ? quantifier.
861         //  Compiles to
862         //     1. state save 3
863         //     2.    body of optional block
864         //     3. ...
865         // Insert the state save into the compiled pattern, and we're done.
866         {
867             int32_t   saveStateLoc = blockTopLoc(TRUE);
868             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
869             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
870         }
871         break;
872 
873     case doNGOpt:
874         // Non-greedy ?? quantifier
875         //   compiles to
876         //    1.  jmp   4
877         //    2.     body of optional block
878         //    3   jmp   5
879         //    4.  state save 2
880         //    5    ...
881         //  This code is less than ideal, with two jmps instead of one, because we can only
882         //  insert one instruction at the top of the block being iterated.
883         {
884             int32_t  jmp1_loc = blockTopLoc(TRUE);
885             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
886 
887             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
888             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
889 
890             appendOp(URX_JMP, jmp2_loc+2);
891 
892             appendOp(URX_STATE_SAVE, jmp1_loc+1);
893         }
894         break;
895 
896 
897     case doStar:
898         // Normal (greedy) * quantifier.
899         // Compiles to
900         //       1.   STATE_SAVE   4
901         //       2.      body of stuff being iterated over
902         //       3.   JMP_SAV      2
903         //       4.   ...
904         //
905         // Or, if the body is a simple [Set],
906         //       1.   LOOP_SR_I    set number
907         //       2.   LOOP_C       stack location
908         //       ...
909         //
910         // Or if this is a .*
911         //       1.   LOOP_DOT_I    (. matches all mode flag)
912         //       2.   LOOP_C        stack location
913         //
914         // Or, if the body can match a zero-length string, to inhibit infinite loops,
915         //       1.   STATE_SAVE   5
916         //       2.   STO_INP_LOC  data-loc
917         //       3.      body of stuff
918         //       4.   JMP_SAV_X    2
919         //       5.   ...
920         {
921             // location of item #1, the STATE_SAVE
922             int32_t   topLoc = blockTopLoc(FALSE);
923             int32_t   dataLoc = -1;
924 
925             // Check for simple *, where the construct being repeated
926             //   compiled to single opcode, and might be optimizable.
927             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
928                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
929 
930                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
931                     // Emit optimized code for a [char set]*
932                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
933                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
934                     dataLoc = allocateStackData(1);
935                     appendOp(URX_LOOP_C, dataLoc);
936                     break;
937                 }
938 
939                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
940                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
941                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
942                     // Emit Optimized code for .* operations.
943                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
944                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
945                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
946                         loopOpI |= 1;
947                     }
948                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
949                         loopOpI |= 2;
950                     }
951                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
952                     dataLoc = allocateStackData(1);
953                     appendOp(URX_LOOP_C, dataLoc);
954                     break;
955                 }
956             }
957 
958             // Emit general case code for this *
959             // The optimizations did not apply.
960 
961             int32_t   saveStateLoc = blockTopLoc(TRUE);
962             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
963 
964             // Check for minimum match length of zero, which requires
965             //    extra loop-breaking code.
966             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
967                 insertOp(saveStateLoc);
968                 dataLoc = allocateStackData(1);
969 
970                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
971                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
972                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
973             }
974 
975             // Locate the position in the compiled pattern where the match will continue
976             //   after completing the *.   (4 or 5 in the comment above)
977             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
978 
979             // Put together the save state op and store it into the compiled code.
980             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
981             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
982 
983             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
984             appendOp(jmpOp);
985         }
986         break;
987 
988     case doNGStar:
989         // Non-greedy *? quantifier
990         // compiles to
991         //     1.   JMP    3
992         //     2.      body of stuff being iterated over
993         //     3.   STATE_SAVE  2
994         //     4    ...
995         {
996             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
997             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
998             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
999             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000             appendOp(URX_STATE_SAVE, jmpLoc+1);
1001         }
1002         break;
1003 
1004 
1005     case doIntervalInit:
1006         // The '{' opening an interval quantifier was just scanned.
1007         // Init the counter variables that will accumulate the values as the digits
1008         //    are scanned.
1009         fIntervalLow = 0;
1010         fIntervalUpper = -1;
1011         break;
1012 
1013     case doIntevalLowerDigit:
1014         // Scanned a digit from the lower value of an {lower,upper} interval
1015         {
1016             int32_t digitValue = u_charDigitValue(fC.fChar);
1017             U_ASSERT(digitValue >= 0);
1018             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1019             if (val > INT32_MAX) {
1020                 error(U_REGEX_NUMBER_TOO_BIG);
1021             } else {
1022                 fIntervalLow = (int32_t)val;
1023             }
1024         }
1025         break;
1026 
1027     case doIntervalUpperDigit:
1028         // Scanned a digit from the upper value of an {lower,upper} interval
1029         {
1030             if (fIntervalUpper < 0) {
1031                 fIntervalUpper = 0;
1032             }
1033             int32_t digitValue = u_charDigitValue(fC.fChar);
1034             U_ASSERT(digitValue >= 0);
1035             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1036             if (val > INT32_MAX) {
1037                 error(U_REGEX_NUMBER_TOO_BIG);
1038             } else {
1039                 fIntervalUpper = (int32_t)val;
1040             }
1041         }
1042         break;
1043 
1044     case doIntervalSame:
1045         // Scanned a single value interval like {27}.  Upper = Lower.
1046         fIntervalUpper = fIntervalLow;
1047         break;
1048 
1049     case doInterval:
1050         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1051         if (compileInlineInterval() == FALSE) {
1052             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053         }
1054         break;
1055 
1056     case doPossessiveInterval:
1057         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1058         {
1059             // Remember the loc for the top of the block being looped over.
1060             //   (Can not reserve a slot in the compiled pattern at this time, because
1061             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1062             //    once per block.)
1063             int32_t topLoc = blockTopLoc(FALSE);
1064 
1065             // Produce normal looping code.
1066             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1067 
1068             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1069             //  just as if the loop was inclosed in atomic parentheses.
1070 
1071             // First the STO_SP before the start of the loop
1072             insertOp(topLoc);
1073 
1074             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1075             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1076             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1077 
1078             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1079             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1080             loopOp++;     // point LoopOp after the just-inserted STO_SP
1081             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1082 
1083             // Then the LD_SP after the end of the loop
1084             appendOp(URX_LD_SP, varLoc);
1085         }
1086 
1087         break;
1088 
1089     case doNGInterval:
1090         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1091         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1092         break;
1093 
1094     case doIntervalError:
1095         error(U_REGEX_BAD_INTERVAL);
1096         break;
1097 
1098     case doLiteralChar:
1099         // We've just scanned a "normal" character from the pattern,
1100         literalChar(fC.fChar);
1101         break;
1102 
1103 
1104     case doEscapedLiteralChar:
1105         // We've just scanned an backslashed escaped character with  no
1106         //   special meaning.  It represents itself.
1107         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1108             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1109             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1110                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1111              }
1112         literalChar(fC.fChar);
1113         break;
1114 
1115 
1116     case doDotAny:
1117         // scanned a ".",  match any single character.
1118         {
1119             fixLiterals(FALSE);
1120             if (fModeFlags & UREGEX_DOTALL) {
1121                 appendOp(URX_DOTANY_ALL, 0);
1122             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1123                 appendOp(URX_DOTANY_UNIX, 0);
1124             } else {
1125                 appendOp(URX_DOTANY, 0);
1126             }
1127         }
1128         break;
1129 
1130     case doCaret:
1131         {
1132             fixLiterals(FALSE);
1133             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134                 appendOp(URX_CARET, 0);
1135             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136                 appendOp(URX_CARET_M, 0);
1137             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1139             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140                 appendOp(URX_CARET_M_UNIX, 0);
1141             }
1142         }
1143         break;
1144 
1145     case doDollar:
1146         {
1147             fixLiterals(FALSE);
1148             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1149                 appendOp(URX_DOLLAR, 0);
1150             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151                 appendOp(URX_DOLLAR_M, 0);
1152             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1153                 appendOp(URX_DOLLAR_D, 0);
1154             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155                 appendOp(URX_DOLLAR_MD, 0);
1156             }
1157         }
1158         break;
1159 
1160     case doBackslashA:
1161         fixLiterals(FALSE);
1162         appendOp(URX_CARET, 0);
1163         break;
1164 
1165     case doBackslashB:
1166         {
1167             #if  UCONFIG_NO_BREAK_ITERATION==1
1168             if (fModeFlags & UREGEX_UWORD) {
1169                 error(U_UNSUPPORTED_ERROR);
1170             }
1171             #endif
1172             fixLiterals(FALSE);
1173             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1174             appendOp(op, 1);
1175         }
1176         break;
1177 
1178     case doBackslashb:
1179         {
1180             #if  UCONFIG_NO_BREAK_ITERATION==1
1181             if (fModeFlags & UREGEX_UWORD) {
1182                 error(U_UNSUPPORTED_ERROR);
1183             }
1184             #endif
1185             fixLiterals(FALSE);
1186             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1187             appendOp(op, 0);
1188         }
1189         break;
1190 
1191     case doBackslashD:
1192         fixLiterals(FALSE);
1193         appendOp(URX_BACKSLASH_D, 1);
1194         break;
1195 
1196     case doBackslashd:
1197         fixLiterals(FALSE);
1198         appendOp(URX_BACKSLASH_D, 0);
1199         break;
1200 
1201     case doBackslashG:
1202         fixLiterals(FALSE);
1203         appendOp(URX_BACKSLASH_G, 0);
1204         break;
1205 
1206     case doBackslashH:
1207         fixLiterals(FALSE);
1208         appendOp(URX_BACKSLASH_H, 1);
1209         break;
1210 
1211     case doBackslashh:
1212         fixLiterals(FALSE);
1213         appendOp(URX_BACKSLASH_H, 0);
1214         break;
1215 
1216     case doBackslashR:
1217         fixLiterals(FALSE);
1218         appendOp(URX_BACKSLASH_R, 0);
1219         break;
1220 
1221     case doBackslashS:
1222         fixLiterals(FALSE);
1223         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1224         break;
1225 
1226     case doBackslashs:
1227         fixLiterals(FALSE);
1228         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1229         break;
1230 
1231     case doBackslashV:
1232         fixLiterals(FALSE);
1233         appendOp(URX_BACKSLASH_V, 1);
1234         break;
1235 
1236     case doBackslashv:
1237         fixLiterals(FALSE);
1238         appendOp(URX_BACKSLASH_V, 0);
1239         break;
1240 
1241     case doBackslashW:
1242         fixLiterals(FALSE);
1243         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1244         break;
1245 
1246     case doBackslashw:
1247         fixLiterals(FALSE);
1248         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1249         break;
1250 
1251     case doBackslashX:
1252         #if  UCONFIG_NO_BREAK_ITERATION==1
1253         // Grapheme Cluster Boundary requires ICU break iteration.
1254         error(U_UNSUPPORTED_ERROR);
1255         #endif
1256         fixLiterals(FALSE);
1257         appendOp(URX_BACKSLASH_X, 0);
1258         break;
1259 
1260     case doBackslashZ:
1261         fixLiterals(FALSE);
1262         appendOp(URX_DOLLAR, 0);
1263         break;
1264 
1265     case doBackslashz:
1266         fixLiterals(FALSE);
1267         appendOp(URX_BACKSLASH_Z, 0);
1268         break;
1269 
1270     case doEscapeError:
1271         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1272         break;
1273 
1274     case doExit:
1275         fixLiterals(FALSE);
1276         returnVal = FALSE;
1277         break;
1278 
1279     case doProperty:
1280         {
1281             fixLiterals(FALSE);
1282             UnicodeSet *theSet = scanProp();
1283             compileSet(theSet);
1284         }
1285         break;
1286 
1287     case doNamedChar:
1288         {
1289             UChar32 c = scanNamedChar();
1290             literalChar(c);
1291         }
1292         break;
1293 
1294 
1295     case doBackRef:
1296         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1297         //                 the regular expression, because the number of digits to be consumed
1298         //                 depends on the number of capture groups that have been defined.  So
1299         //                 we have to do it here instead.
1300         {
1301             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1302             int32_t  groupNum = 0;
1303             UChar32  c        = fC.fChar;
1304 
1305             for (;;) {
1306                 // Loop once per digit, for max allowed number of digits in a back reference.
1307                 int32_t digit = u_charDigitValue(c);
1308                 groupNum = groupNum * 10 + digit;
1309                 if (groupNum >= numCaptureGroups) {
1310                     break;
1311                 }
1312                 c = peekCharLL();
1313                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1314                     break;
1315                 }
1316                 nextCharLL();
1317             }
1318 
1319             // Scan of the back reference in the source regexp is complete.  Now generate
1320             //  the compiled code for it.
1321             // Because capture groups can be forward-referenced by back-references,
1322             //  we fill the operand with the capture group number.  At the end
1323             //  of compilation, it will be changed to the variable's location.
1324             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1325                                      //    and shouldn't enter this code path at all.
1326             fixLiterals(FALSE);
1327             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1328                 appendOp(URX_BACKREF_I, groupNum);
1329             } else {
1330                 appendOp(URX_BACKREF, groupNum);
1331             }
1332         }
1333         break;
1334 
1335     case doBeginNamedBackRef:
1336         U_ASSERT(fCaptureName == NULL);
1337         fCaptureName = new UnicodeString;
1338         if (fCaptureName == NULL) {
1339             error(U_MEMORY_ALLOCATION_ERROR);
1340         }
1341         break;
1342 
1343     case doContinueNamedBackRef:
1344         fCaptureName->append(fC.fChar);
1345         break;
1346 
1347     case doCompleteNamedBackRef:
1348         {
1349         int32_t groupNumber =
1350             fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1351         if (groupNumber == 0) {
1352             // Group name has not been defined.
1353             //   Could be a forward reference. If we choose to support them at some
1354             //   future time, extra mechanism will be required at this point.
1355             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1356         } else {
1357             // Given the number, handle identically to a \n numbered back reference.
1358             // See comments above, under doBackRef
1359             fixLiterals(FALSE);
1360             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1361                 appendOp(URX_BACKREF_I, groupNumber);
1362             } else {
1363                 appendOp(URX_BACKREF, groupNumber);
1364             }
1365         }
1366         delete fCaptureName;
1367         fCaptureName = NULL;
1368         break;
1369         }
1370 
1371     case doPossessivePlus:
1372         // Possessive ++ quantifier.
1373         // Compiles to
1374         //       1.   STO_SP
1375         //       2.      body of stuff being iterated over
1376         //       3.   STATE_SAVE 5
1377         //       4.   JMP        2
1378         //       5.   LD_SP
1379         //       6.   ...
1380         //
1381         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1382         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1383         //
1384         {
1385             // Emit the STO_SP
1386             int32_t   topLoc = blockTopLoc(TRUE);
1387             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1388             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1389             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1390 
1391             // Emit the STATE_SAVE
1392             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1393 
1394             // Emit the JMP
1395             appendOp(URX_JMP, topLoc+1);
1396 
1397             // Emit the LD_SP
1398             appendOp(URX_LD_SP, stoLoc);
1399         }
1400         break;
1401 
1402     case doPossessiveStar:
1403         // Possessive *+ quantifier.
1404         // Compiles to
1405         //       1.   STO_SP       loc
1406         //       2.   STATE_SAVE   5
1407         //       3.      body of stuff being iterated over
1408         //       4.   JMP          2
1409         //       5.   LD_SP        loc
1410         //       6    ...
1411         // TODO:  do something to cut back the state stack each time through the loop.
1412         {
1413             // Reserve two slots at the top of the block.
1414             int32_t   topLoc = blockTopLoc(TRUE);
1415             insertOp(topLoc);
1416 
1417             // emit   STO_SP     loc
1418             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1419             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1420             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1421 
1422             // Emit the SAVE_STATE   5
1423             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1424             op = buildOp(URX_STATE_SAVE, L7);
1425             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1426 
1427             // Append the JMP operation.
1428             appendOp(URX_JMP, topLoc+1);
1429 
1430             // Emit the LD_SP       loc
1431             appendOp(URX_LD_SP, stoLoc);
1432         }
1433         break;
1434 
1435     case doPossessiveOpt:
1436         // Possessive  ?+ quantifier.
1437         //  Compiles to
1438         //     1. STO_SP      loc
1439         //     2. SAVE_STATE  5
1440         //     3.    body of optional block
1441         //     4. LD_SP       loc
1442         //     5. ...
1443         //
1444         {
1445             // Reserve two slots at the top of the block.
1446             int32_t   topLoc = blockTopLoc(TRUE);
1447             insertOp(topLoc);
1448 
1449             // Emit the STO_SP
1450             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1451             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1452             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1453 
1454             // Emit the SAVE_STATE
1455             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1456             op = buildOp(URX_STATE_SAVE, continueLoc);
1457             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1458 
1459             // Emit the LD_SP
1460             appendOp(URX_LD_SP, stoLoc);
1461         }
1462         break;
1463 
1464 
1465     case doBeginMatchMode:
1466         fNewModeFlags = fModeFlags;
1467         fSetModeFlag  = TRUE;
1468         break;
1469 
1470     case doMatchMode:   //  (?i)    and similar
1471         {
1472             int32_t  bit = 0;
1473             switch (fC.fChar) {
1474             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1475             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1476             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1477             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1478             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1479             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1480             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1481             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1482             default:
1483                 UPRV_UNREACHABLE_EXIT;  // Should never happen.  Other chars are filtered out
1484                                         // by the scanner.
1485             }
1486             if (fSetModeFlag) {
1487                 fNewModeFlags |= bit;
1488             } else {
1489                 fNewModeFlags &= ~bit;
1490             }
1491         }
1492         break;
1493 
1494     case doSetMatchMode:
1495         // Emit code to match any pending literals, using the not-yet changed match mode.
1496         fixLiterals();
1497 
1498         // We've got a (?i) or similar.  The match mode is being changed, but
1499         //   the change is not scoped to a parenthesized block.
1500         U_ASSERT(fNewModeFlags < 0);
1501         fModeFlags = fNewModeFlags;
1502 
1503         break;
1504 
1505 
1506     case doMatchModeParen:
1507         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1508         //   mode flags so they can be restored at the close of the block.
1509         //
1510         //   Compile to a
1511         //      - NOP, which later may be replaced by a save-state if the
1512         //         parenthesized group gets a * quantifier, followed by
1513         //      - NOP, which may later be replaced by a save-state if there
1514         //             is an '|' alternation within the parens.
1515         {
1516             fixLiterals(FALSE);
1517             appendOp(URX_NOP, 0);
1518             appendOp(URX_NOP, 0);
1519 
1520             // On the Parentheses stack, start a new frame and add the positions
1521             //   of the two NOPs (a normal non-capturing () frame, except for the
1522             //   saving of the original mode flags.)
1523             fParenStack.push(fModeFlags, *fStatus);
1524             fParenStack.push(flags, *fStatus);                            // Frame Marker
1525             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1526             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1527 
1528             // Set the current mode flags to the new values.
1529             U_ASSERT(fNewModeFlags < 0);
1530             fModeFlags = fNewModeFlags;
1531         }
1532         break;
1533 
1534     case doBadModeFlag:
1535         error(U_REGEX_INVALID_FLAG);
1536         break;
1537 
1538     case doSuppressComments:
1539         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1540         // treating a '#' as a to-the-end-of-line comment.
1541         //   (This Perl compatibility just gets uglier and uglier to do...)
1542         fEOLComments = FALSE;
1543         break;
1544 
1545 
1546     case doSetAddAmp:
1547         {
1548           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1549           set->add(chAmp);
1550         }
1551         break;
1552 
1553     case doSetAddDash:
1554         {
1555           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1556           set->add(chDash);
1557         }
1558         break;
1559 
1560      case doSetBackslash_s:
1561         {
1562          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1563          set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1564          break;
1565         }
1566 
1567      case doSetBackslash_S:
1568         {
1569             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1570             UnicodeSet SSet;
1571             SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1572             set->addAll(SSet);
1573             break;
1574         }
1575 
1576     case doSetBackslash_d:
1577         {
1578             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1579             // TODO - make a static set, ticket 6058.
1580             addCategory(set, U_GC_ND_MASK, *fStatus);
1581             break;
1582         }
1583 
1584     case doSetBackslash_D:
1585         {
1586             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1587             UnicodeSet digits;
1588             // TODO - make a static set, ticket 6058.
1589             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1590             digits.complement();
1591             set->addAll(digits);
1592             break;
1593         }
1594 
1595     case doSetBackslash_h:
1596         {
1597             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1598             UnicodeSet h;
1599             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1600             h.add((UChar32)9);   // Tab
1601             set->addAll(h);
1602             break;
1603         }
1604 
1605     case doSetBackslash_H:
1606         {
1607             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1608             UnicodeSet h;
1609             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1610             h.add((UChar32)9);   // Tab
1611             h.complement();
1612             set->addAll(h);
1613             break;
1614         }
1615 
1616     case doSetBackslash_v:
1617         {
1618             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1619             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1620             set->add((UChar32)0x85);
1621             set->add((UChar32)0x2028, (UChar32)0x2029);
1622             break;
1623         }
1624 
1625     case doSetBackslash_V:
1626         {
1627             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1628             UnicodeSet v;
1629             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1630             v.add((UChar32)0x85);
1631             v.add((UChar32)0x2028, (UChar32)0x2029);
1632             v.complement();
1633             set->addAll(v);
1634             break;
1635         }
1636 
1637     case doSetBackslash_w:
1638         {
1639             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1640             set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1641             break;
1642         }
1643 
1644     case doSetBackslash_W:
1645         {
1646             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1647             UnicodeSet SSet;
1648             SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1649             set->addAll(SSet);
1650             break;
1651         }
1652 
1653     case doSetBegin:
1654         {
1655             fixLiterals(FALSE);
1656             LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
1657             fSetStack.push(lpSet.orphan(), *fStatus);
1658             fSetOpStack.push(setStart, *fStatus);
1659             if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1660                 fSetOpStack.push(setCaseClose, *fStatus);
1661             }
1662             break;
1663         }
1664 
1665     case doSetBeginDifference1:
1666         //  We have scanned something like [[abc]-[
1667         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1668         //  Push a Difference operator, which will cause the new set to be subtracted from what
1669         //    went before once it is created.
1670         setPushOp(setDifference1);
1671         fSetOpStack.push(setStart, *fStatus);
1672         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1673             fSetOpStack.push(setCaseClose, *fStatus);
1674         }
1675         break;
1676 
1677     case doSetBeginIntersection1:
1678         //  We have scanned something like  [[abc]&[
1679         //   Need both the '&' operator and the open '[' operator.
1680         setPushOp(setIntersection1);
1681         fSetOpStack.push(setStart, *fStatus);
1682         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1683             fSetOpStack.push(setCaseClose, *fStatus);
1684         }
1685         break;
1686 
1687     case doSetBeginUnion:
1688         //  We have scanned something like  [[abc][
1689         //     Need to handle the union operation explicitly [[abc] | [
1690         setPushOp(setUnion);
1691         fSetOpStack.push(setStart, *fStatus);
1692         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1693             fSetOpStack.push(setCaseClose, *fStatus);
1694         }
1695         break;
1696 
1697     case doSetDifference2:
1698         // We have scanned something like [abc--
1699         //   Consider this to unambiguously be a set difference operator.
1700         setPushOp(setDifference2);
1701         break;
1702 
1703     case doSetEnd:
1704         // Have encountered the ']' that closes a set.
1705         //    Force the evaluation of any pending operations within this set,
1706         //    leave the completed set on the top of the set stack.
1707         setEval(setEnd);
1708         U_ASSERT(fSetOpStack.peeki()==setStart);
1709         fSetOpStack.popi();
1710         break;
1711 
1712     case doSetFinish:
1713         {
1714         // Finished a complete set expression, including all nested sets.
1715         //   The close bracket has already triggered clearing out pending set operators,
1716         //    the operator stack should be empty and the operand stack should have just
1717         //    one entry, the result set.
1718         U_ASSERT(fSetOpStack.empty());
1719         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1720         U_ASSERT(fSetStack.empty());
1721         compileSet(theSet);
1722         break;
1723         }
1724 
1725     case doSetIntersection2:
1726         // Have scanned something like [abc&&
1727         setPushOp(setIntersection2);
1728         break;
1729 
1730     case doSetLiteral:
1731         // Union the just-scanned literal character into the set being built.
1732         //    This operation is the highest precedence set operation, so we can always do
1733         //    it immediately, without waiting to see what follows.  It is necessary to perform
1734         //    any pending '-' or '&' operation first, because these have the same precedence
1735         //    as union-ing in a literal'
1736         {
1737             setEval(setUnion);
1738             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1739             s->add(fC.fChar);
1740             fLastSetLiteral = fC.fChar;
1741             break;
1742         }
1743 
1744     case doSetLiteralEscaped:
1745         // A back-slash escaped literal character was encountered.
1746         // Processing is the same as with setLiteral, above, with the addition of
1747         //  the optional check for errors on escaped ASCII letters.
1748         {
1749             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1750                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1751                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1752                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1753             }
1754             setEval(setUnion);
1755             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1756             s->add(fC.fChar);
1757             fLastSetLiteral = fC.fChar;
1758             break;
1759         }
1760 
1761         case doSetNamedChar:
1762         // Scanning a \N{UNICODE CHARACTER NAME}
1763         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1764         //    above.
1765         {
1766             UChar32  c = scanNamedChar();
1767             setEval(setUnion);
1768             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1769             s->add(c);
1770             fLastSetLiteral = c;
1771             break;
1772         }
1773 
1774     case doSetNamedRange:
1775         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1776         // The left character is already in the set, and is saved in fLastSetLiteral.
1777         // The right side needs to be picked up, the scan is at the 'N'.
1778         // Lower Limit > Upper limit being an error matches both Java
1779         //        and ICU UnicodeSet behavior.
1780         {
1781             UChar32  c = scanNamedChar();
1782             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1783                 error(U_REGEX_INVALID_RANGE);
1784             }
1785             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1786             s->add(fLastSetLiteral, c);
1787             fLastSetLiteral = c;
1788             break;
1789         }
1790 
1791 
1792     case  doSetNegate:
1793         // Scanned a '^' at the start of a set.
1794         // Push the negation operator onto the set op stack.
1795         // A twist for case-insensitive matching:
1796         //   the case closure operation must happen _before_ negation.
1797         //   But the case closure operation will already be on the stack if it's required.
1798         //   This requires checking for case closure, and swapping the stack order
1799         //    if it is present.
1800         {
1801             int32_t  tosOp = fSetOpStack.peeki();
1802             if (tosOp == setCaseClose) {
1803                 fSetOpStack.popi();
1804                 fSetOpStack.push(setNegation, *fStatus);
1805                 fSetOpStack.push(setCaseClose, *fStatus);
1806             } else {
1807                 fSetOpStack.push(setNegation, *fStatus);
1808             }
1809         }
1810         break;
1811 
1812     case doSetNoCloseError:
1813         error(U_REGEX_MISSING_CLOSE_BRACKET);
1814         break;
1815 
1816     case doSetOpError:
1817         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1818         break;
1819 
1820     case doSetPosixProp:
1821         {
1822             UnicodeSet *s = scanPosixProp();
1823             if (s != NULL) {
1824                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1825                 tos->addAll(*s);
1826                 delete s;
1827             }  // else error.  scanProp() reported the error status already.
1828         }
1829         break;
1830 
1831     case doSetProp:
1832         //  Scanned a \p \P within [brackets].
1833         {
1834             UnicodeSet *s = scanProp();
1835             if (s != NULL) {
1836                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1837                 tos->addAll(*s);
1838                 delete s;
1839             }  // else error.  scanProp() reported the error status already.
1840         }
1841         break;
1842 
1843 
1844     case doSetRange:
1845         // We have scanned literal-literal.  Add the range to the set.
1846         // The left character is already in the set, and is saved in fLastSetLiteral.
1847         // The right side is the current character.
1848         // Lower Limit > Upper limit being an error matches both Java
1849         //        and ICU UnicodeSet behavior.
1850         {
1851 
1852         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1853             error(U_REGEX_INVALID_RANGE);
1854         }
1855         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1856         s->add(fLastSetLiteral, fC.fChar);
1857         break;
1858         }
1859 
1860     default:
1861         UPRV_UNREACHABLE_EXIT;
1862     }
1863 
1864     if (U_FAILURE(*fStatus)) {
1865         returnVal = FALSE;
1866     }
1867 
1868     return returnVal;
1869 }
1870 
1871 
1872 
1873 //------------------------------------------------------------------------------
1874 //
1875 //   literalChar           We've encountered a literal character from the pattern,
1876 //                             or an escape sequence that reduces to a character.
1877 //                         Add it to the string containing all literal chars/strings from
1878 //                             the pattern.
1879 //
1880 //------------------------------------------------------------------------------
literalChar(UChar32 c)1881 void RegexCompile::literalChar(UChar32 c)  {
1882     fLiteralChars.append(c);
1883 }
1884 
1885 
1886 //------------------------------------------------------------------------------
1887 //
1888 //    fixLiterals           When compiling something that can follow a literal
1889 //                          string in a pattern, emit the code to match the
1890 //                          accumulated literal string.
1891 //
1892 //                          Optionally, split the last char of the string off into
1893 //                          a single "ONE_CHAR" operation, so that quantifiers can
1894 //                          apply to that char alone.  Example:   abc*
1895 //                          The * must apply to the 'c' only.
1896 //
1897 //------------------------------------------------------------------------------
fixLiterals(UBool split)1898 void    RegexCompile::fixLiterals(UBool split) {
1899 
1900     // If no literal characters have been scanned but not yet had code generated
1901     //   for them, nothing needs to be done.
1902     if (fLiteralChars.length() == 0) {
1903         return;
1904     }
1905 
1906     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1907     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1908 
1909     // Split:  We need to  ensure that the last item in the compiled pattern
1910     //     refers only to the last literal scanned in the pattern, so that
1911     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1912     //     Split before case folding for case insensitive matches.
1913 
1914     if (split) {
1915         fLiteralChars.truncate(indexOfLastCodePoint);
1916         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1917                               //  Note that the truncated literal string may be empty, in which case
1918                               //  nothing will be emitted.
1919 
1920         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1921         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1922         return;
1923     }
1924 
1925     // If we are doing case-insensitive matching, case fold the string.  This may expand
1926     //   the string, e.g. the German sharp-s turns into "ss"
1927     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928         fLiteralChars.foldCase();
1929         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1930         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1931     }
1932 
1933     if (indexOfLastCodePoint == 0) {
1934         // Single character, emit a URX_ONECHAR op to match it.
1935         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1936                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1937             appendOp(URX_ONECHAR_I, lastCodePoint);
1938         } else {
1939             appendOp(URX_ONECHAR, lastCodePoint);
1940         }
1941     } else {
1942         // Two or more chars, emit a URX_STRING to match them.
1943         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1944             error(U_REGEX_PATTERN_TOO_BIG);
1945         }
1946         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1947             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1948         } else {
1949             // TODO here:  add optimization to split case sensitive strings of length two
1950             //             into two single char ops, for efficiency.
1951             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1952         }
1953         appendOp(URX_STRING_LEN, fLiteralChars.length());
1954 
1955         // Add this string into the accumulated strings of the compiled pattern.
1956         fRXPat->fLiteralText.append(fLiteralChars);
1957     }
1958 
1959     fLiteralChars.remove();
1960 }
1961 
1962 
buildOp(int32_t type,int32_t val)1963 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1964     if (U_FAILURE(*fStatus)) {
1965         return 0;
1966     }
1967     if (type < 0 || type > 255) {
1968         UPRV_UNREACHABLE_EXIT;
1969     }
1970     if (val > 0x00ffffff) {
1971         UPRV_UNREACHABLE_EXIT;
1972     }
1973     if (val < 0) {
1974         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1975             UPRV_UNREACHABLE_EXIT;
1976         }
1977         if (URX_TYPE(val) != 0xff) {
1978             UPRV_UNREACHABLE_EXIT;
1979         }
1980         type = URX_RESERVED_OP_N;
1981     }
1982     return (type << 24) | val;
1983 }
1984 
1985 
1986 //------------------------------------------------------------------------------
1987 //
1988 //   appendOp()             Append a new instruction onto the compiled pattern
1989 //                          Includes error checking, limiting the size of the
1990 //                          pattern to lengths that can be represented in the
1991 //                          24 bit operand field of an instruction.
1992 //
1993 //------------------------------------------------------------------------------
appendOp(int32_t op)1994 void RegexCompile::appendOp(int32_t op) {
1995     if (U_FAILURE(*fStatus)) {
1996         return;
1997     }
1998     fRXPat->fCompiledPat->addElement(op, *fStatus);
1999     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2000         error(U_REGEX_PATTERN_TOO_BIG);
2001     }
2002 }
2003 
appendOp(int32_t type,int32_t val)2004 void RegexCompile::appendOp(int32_t type, int32_t val) {
2005     appendOp(buildOp(type, val));
2006 }
2007 
2008 
2009 //------------------------------------------------------------------------------
2010 //
2011 //   insertOp()             Insert a slot for a new opcode into the already
2012 //                          compiled pattern code.
2013 //
2014 //                          Fill the slot with a NOP.  Our caller will replace it
2015 //                          with what they really wanted.
2016 //
2017 //------------------------------------------------------------------------------
insertOp(int32_t where)2018 void   RegexCompile::insertOp(int32_t where) {
2019     UVector64 *code = fRXPat->fCompiledPat;
2020     U_ASSERT(where>0 && where < code->size());
2021 
2022     int32_t  nop = buildOp(URX_NOP, 0);
2023     code->insertElementAt(nop, where, *fStatus);
2024 
2025     // Walk through the pattern, looking for any ops with targets that
2026     //  were moved down by the insert.  Fix them.
2027     int32_t loc;
2028     for (loc=0; loc<code->size(); loc++) {
2029         int32_t op = (int32_t)code->elementAti(loc);
2030         int32_t opType = URX_TYPE(op);
2031         int32_t opValue = URX_VAL(op);
2032         if ((opType == URX_JMP         ||
2033             opType == URX_JMPX         ||
2034             opType == URX_STATE_SAVE   ||
2035             opType == URX_CTR_LOOP     ||
2036             opType == URX_CTR_LOOP_NG  ||
2037             opType == URX_JMP_SAV      ||
2038             opType == URX_JMP_SAV_X    ||
2039             opType == URX_RELOC_OPRND)    && opValue > where) {
2040             // Target location for this opcode is after the insertion point and
2041             //   needs to be incremented to adjust for the insertion.
2042             opValue++;
2043             op = buildOp(opType, opValue);
2044             code->setElementAt(op, loc);
2045         }
2046     }
2047 
2048     // Now fix up the parentheses stack.  All positive values in it are locations in
2049     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2050     for (loc=0; loc<fParenStack.size(); loc++) {
2051         int32_t x = fParenStack.elementAti(loc);
2052         U_ASSERT(x < code->size());
2053         if (x>where) {
2054             x++;
2055             fParenStack.setElementAt(x, loc);
2056         }
2057     }
2058 
2059     if (fMatchCloseParen > where) {
2060         fMatchCloseParen++;
2061     }
2062     if (fMatchOpenParen > where) {
2063         fMatchOpenParen++;
2064     }
2065 }
2066 
2067 
2068 //------------------------------------------------------------------------------
2069 //
2070 //   allocateData()        Allocate storage in the matcher's static data area.
2071 //                         Return the index for the newly allocated data.
2072 //                         The storage won't actually exist until we are running a match
2073 //                         operation, but the storage indexes are inserted into various
2074 //                         opcodes while compiling the pattern.
2075 //
2076 //------------------------------------------------------------------------------
allocateData(int32_t size)2077 int32_t RegexCompile::allocateData(int32_t size) {
2078     if (U_FAILURE(*fStatus)) {
2079         return 0;
2080     }
2081     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2082         error(U_REGEX_INTERNAL_ERROR);
2083         return 0;
2084     }
2085     int32_t dataIndex = fRXPat->fDataSize;
2086     fRXPat->fDataSize += size;
2087     if (fRXPat->fDataSize >= 0x00fffff0) {
2088         error(U_REGEX_INTERNAL_ERROR);
2089     }
2090     return dataIndex;
2091 }
2092 
2093 
2094 //------------------------------------------------------------------------------
2095 //
2096 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2097 //                         Return the index for the newly allocated data.
2098 //                         The frame indexes are inserted into various
2099 //                         opcodes while compiling the pattern, meaning that frame
2100 //                         size must be restricted to the size that will fit
2101 //                         as an operand (24 bits).
2102 //
2103 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2104 int32_t RegexCompile::allocateStackData(int32_t size) {
2105     if (U_FAILURE(*fStatus)) {
2106         return 0;
2107     }
2108     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2109         error(U_REGEX_INTERNAL_ERROR);
2110         return 0;
2111     }
2112     int32_t dataIndex = fRXPat->fFrameSize;
2113     fRXPat->fFrameSize += size;
2114     if (fRXPat->fFrameSize >= 0x00fffff0) {
2115         error(U_REGEX_PATTERN_TOO_BIG);
2116     }
2117     return dataIndex;
2118 }
2119 
2120 
2121 //------------------------------------------------------------------------------
2122 //
2123 //   blockTopLoc()          Find or create a location in the compiled pattern
2124 //                          at the start of the operation or block that has
2125 //                          just been compiled.  Needed when a quantifier (* or
2126 //                          whatever) appears, and we need to add an operation
2127 //                          at the start of the thing being quantified.
2128 //
2129 //                          (Parenthesized Blocks) have a slot with a NOP that
2130 //                          is reserved for this purpose.  .* or similar don't
2131 //                          and a slot needs to be added.
2132 //
2133 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2134 //                                         at the returned location.
2135 //                                 FALSE - just return the address,
2136 //                                         do not reserve a location there.
2137 //
2138 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2139 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2140     int32_t   theLoc;
2141     fixLiterals(TRUE);  // Emit code for any pending literals.
2142                         //   If last item was a string, emit separate op for the its last char.
2143     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2144     {
2145         // The item just processed is a parenthesized block.
2146         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2147         U_ASSERT(theLoc > 0);
2148         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2149     }
2150     else {
2151         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2152         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2153         // We need to make space now.
2154         theLoc = fRXPat->fCompiledPat->size()-1;
2155         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2156         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2157             // Strings take two opcode, we want the position of the first one.
2158             // We can have a string at this point if a single character case-folded to two.
2159             theLoc--;
2160         }
2161         if (reserveLoc) {
2162             int32_t  nop = buildOp(URX_NOP, 0);
2163             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2164         }
2165     }
2166     return theLoc;
2167 }
2168 
2169 
2170 
2171 //------------------------------------------------------------------------------
2172 //
2173 //    handleCloseParen      When compiling a close paren, we need to go back
2174 //                          and fix up any JMP or SAVE operations within the
2175 //                          parenthesized block that need to target the end
2176 //                          of the block.  The locations of these are kept on
2177 //                          the paretheses stack.
2178 //
2179 //                          This function is called both when encountering a
2180 //                          real ) and at the end of the pattern.
2181 //
2182 //------------------------------------------------------------------------------
handleCloseParen()2183 void  RegexCompile::handleCloseParen() {
2184     int32_t   patIdx;
2185     int32_t   patOp;
2186     if (fParenStack.size() <= 0) {
2187         error(U_REGEX_MISMATCHED_PAREN);
2188         return;
2189     }
2190 
2191     // Emit code for any pending literals.
2192     fixLiterals(FALSE);
2193 
2194     // Fixup any operations within the just-closed parenthesized group
2195     //    that need to reference the end of the (block).
2196     //    (The first one popped from the stack is an unused slot for
2197     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2198     for (;;) {
2199         patIdx = fParenStack.popi();
2200         if (patIdx < 0) {
2201             // value < 0 flags the start of the frame on the paren stack.
2202             break;
2203         }
2204         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2205         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2206         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2207         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2208         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2209         fMatchOpenParen     = patIdx;
2210     }
2211 
2212     //  At the close of any parenthesized block, restore the match mode flags  to
2213     //  the value they had at the open paren.  Saved value is
2214     //  at the top of the paren stack.
2215     fModeFlags = fParenStack.popi();
2216     U_ASSERT(fModeFlags < 0);
2217 
2218     // DO any additional fixups, depending on the specific kind of
2219     // parentesized grouping this is
2220 
2221     switch (patIdx) {
2222     case plain:
2223     case flags:
2224         // No additional fixups required.
2225         //   (Grouping-only parentheses)
2226         break;
2227     case capturing:
2228         // Capturing Parentheses.
2229         //   Insert a End Capture op into the pattern.
2230         //   The frame offset of the variables for this cg is obtained from the
2231         //       start capture op and put it into the end-capture op.
2232         {
2233             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2234             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2235 
2236             int32_t   frameVarLocation = URX_VAL(captureOp);
2237             appendOp(URX_END_CAPTURE, frameVarLocation);
2238         }
2239         break;
2240     case atomic:
2241         // Atomic Parenthesis.
2242         //   Insert a LD_SP operation to restore the state stack to the position
2243         //   it was when the atomic parens were entered.
2244         {
2245             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2246             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2247             int32_t   stoLoc = URX_VAL(stoOp);
2248             appendOp(URX_LD_SP, stoLoc);
2249         }
2250         break;
2251 
2252     case lookAhead:
2253         {
2254             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2255             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2256             int32_t dataLoc  = URX_VAL(startOp);
2257             appendOp(URX_LA_END, dataLoc);
2258         }
2259         break;
2260 
2261     case negLookAhead:
2262         {
2263             // See comment at doOpenLookAheadNeg
2264             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2265             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2266             int32_t dataLoc  = URX_VAL(startOp);
2267             appendOp(URX_LA_END, dataLoc);
2268             appendOp(URX_BACKTRACK, 0);
2269             appendOp(URX_LA_END, dataLoc);
2270 
2271             // Patch the URX_SAVE near the top of the block.
2272             // The destination of the SAVE is the final LA_END that was just added.
2273             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2274             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2275             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2276             saveOp           = buildOp(URX_STATE_SAVE, dest);
2277             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2278         }
2279         break;
2280 
2281     case lookBehind:
2282         {
2283             // See comment at doOpenLookBehind.
2284 
2285             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2286             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2287             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2288             int32_t dataLoc  = URX_VAL(startOp);
2289             appendOp(URX_LB_END, dataLoc);
2290             appendOp(URX_LA_END, dataLoc);
2291 
2292             // Determine the min and max bounds for the length of the
2293             //  string that the pattern can match.
2294             //  An unbounded upper limit is an error.
2295             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2296             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2297             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2298             if (URX_TYPE(maxML) != 0) {
2299                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2300                 break;
2301             }
2302             if (maxML == INT32_MAX) {
2303                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2304                 break;
2305             }
2306             if (minML == INT32_MAX) {
2307                 // This condition happens when no match is possible, such as with a
2308                 // [set] expression containing no elements.
2309                 // In principle, the generated code to evaluate the expression could be deleted,
2310                 // but it's probably not worth the complication.
2311                 minML = 0;
2312             }
2313             U_ASSERT(minML <= maxML);
2314 
2315             // Insert the min and max match len bounds into the URX_LB_CONT op that
2316             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2317             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2318             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2319 
2320         }
2321         break;
2322 
2323 
2324 
2325     case lookBehindN:
2326         {
2327             // See comment at doOpenLookBehindNeg.
2328 
2329             // Append the URX_LBN_END to the compiled pattern.
2330             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2331             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2332             int32_t dataLoc  = URX_VAL(startOp);
2333             appendOp(URX_LBN_END, dataLoc);
2334 
2335             // Determine the min and max bounds for the length of the
2336             //  string that the pattern can match.
2337             //  An unbounded upper limit is an error.
2338             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2339             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2340             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2341             if (URX_TYPE(maxML) != 0) {
2342                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2343                 break;
2344             }
2345             if (maxML == INT32_MAX) {
2346                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2347                 break;
2348             }
2349             if (minML == INT32_MAX) {
2350                 // This condition happens when no match is possible, such as with a
2351                 // [set] expression containing no elements.
2352                 // In principle, the generated code to evaluate the expression could be deleted,
2353                 // but it's probably not worth the complication.
2354                 minML = 0;
2355             }
2356 
2357             U_ASSERT(minML <= maxML);
2358 
2359             // Insert the min and max match len bounds into the URX_LB_CONT op that
2360             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2361             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2362             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2363 
2364             // Insert the pattern location to continue at after a successful match
2365             //  as the last operand of the URX_LBN_CONT
2366             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2367             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2368         }
2369         break;
2370 
2371 
2372 
2373     default:
2374         UPRV_UNREACHABLE_EXIT;
2375     }
2376 
2377     // remember the next location in the compiled pattern.
2378     // The compilation of Quantifiers will look at this to see whether its looping
2379     //   over a parenthesized block or a single item
2380     fMatchCloseParen = fRXPat->fCompiledPat->size();
2381 }
2382 
2383 
2384 
2385 //------------------------------------------------------------------------------
2386 //
2387 //   compileSet       Compile the pattern operations for a reference to a
2388 //                    UnicodeSet.
2389 //
2390 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2391 void        RegexCompile::compileSet(UnicodeSet *theSet)
2392 {
2393     if (theSet == NULL) {
2394         return;
2395     }
2396     //  Remove any strings from the set.
2397     //  There shouldn't be any, but just in case.
2398     //     (Case Closure can add them; if we had a simple case closure available that
2399     //      ignored strings, that would be better.)
2400     theSet->removeAllStrings();
2401     int32_t  setSize = theSet->size();
2402 
2403     switch (setSize) {
2404     case 0:
2405         {
2406             // Set of no elements.   Always fails to match.
2407             appendOp(URX_BACKTRACK, 0);
2408             delete theSet;
2409         }
2410         break;
2411 
2412     case 1:
2413         {
2414             // The set contains only a single code point.  Put it into
2415             //   the compiled pattern as a single char operation rather
2416             //   than a set, and discard the set itself.
2417             literalChar(theSet->charAt(0));
2418             delete theSet;
2419         }
2420         break;
2421 
2422     default:
2423         {
2424             //  The set contains two or more chars.  (the normal case)
2425             //  Put it into the compiled pattern as a set.
2426             theSet->freeze();
2427             int32_t setNumber = fRXPat->fSets->size();
2428             fRXPat->fSets->addElement(theSet, *fStatus);
2429             if (U_SUCCESS(*fStatus)) {
2430                 appendOp(URX_SETREF, setNumber);
2431             } else {
2432                 delete theSet;
2433             }
2434         }
2435     }
2436 }
2437 
2438 
2439 //------------------------------------------------------------------------------
2440 //
2441 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2442 //                      Except for the specific opcodes used, the code is the same
2443 //                      for all three types (greedy, non-greedy, possessive) of
2444 //                      intervals.  The opcodes are supplied as parameters.
2445 //                      (There are two sets of opcodes - greedy & possessive use the
2446 //                      same ones, while non-greedy has it's own.)
2447 //
2448 //                      The code for interval loops has this form:
2449 //                         0  CTR_INIT   counter loc (in stack frame)
2450 //                         1             5  patt address of CTR_LOOP at bottom of block
2451 //                         2             min count
2452 //                         3             max count   (-1 for unbounded)
2453 //                         4  ...        block to be iterated over
2454 //                         5  CTR_LOOP
2455 //
2456 //                       In
2457 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2458 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2459 {
2460     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2461     //   four slots in the compiled code.  Reserve them.
2462     int32_t   topOfBlock = blockTopLoc(TRUE);
2463     insertOp(topOfBlock);
2464     insertOp(topOfBlock);
2465     insertOp(topOfBlock);
2466 
2467     // The operands for the CTR_INIT opcode include the index in the matcher data
2468     //   of the counter.  Allocate it now. There are two data items
2469     //        counterLoc   -->  Loop counter
2470     //               +1    -->  Input index (for breaking non-progressing loops)
2471     //                          (Only present if unbounded upper limit on loop)
2472     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2473     int32_t   counterLoc = allocateStackData(dataSize);
2474 
2475     int32_t   op = buildOp(InitOp, counterLoc);
2476     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2477 
2478     // The second operand of CTR_INIT is the location following the end of the loop.
2479     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2480     //   compilation of something later on causes the code to grow and the target
2481     //   position to move.
2482     int32_t loopEnd = fRXPat->fCompiledPat->size();
2483     op = buildOp(URX_RELOC_OPRND, loopEnd);
2484     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2485 
2486     // Followed by the min and max counts.
2487     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2488     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2489 
2490     // Append the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2491     //   Goes at end of the block being looped over, so just append to the code so far.
2492     appendOp(LoopOp, topOfBlock);
2493 
2494     if ((fIntervalLow & 0xff000000) != 0 ||
2495         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2496             error(U_REGEX_NUMBER_TOO_BIG);
2497         }
2498 
2499     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2500         error(U_REGEX_MAX_LT_MIN);
2501     }
2502 }
2503 
2504 
2505 
compileInlineInterval()2506 UBool RegexCompile::compileInlineInterval() {
2507     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2508         // Too big to inline.  Fail, which will cause looping code to be generated.
2509         //   (Upper < Lower picks up unbounded upper and errors, both.)
2510         return FALSE;
2511     }
2512 
2513     int32_t   topOfBlock = blockTopLoc(FALSE);
2514     if (fIntervalUpper == 0) {
2515         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2516         // Discard the generated code for the block.
2517         // If the block included parens, discard the info pertaining to them as well.
2518         fRXPat->fCompiledPat->setSize(topOfBlock);
2519         if (fMatchOpenParen >= topOfBlock) {
2520             fMatchOpenParen = -1;
2521         }
2522         if (fMatchCloseParen >= topOfBlock) {
2523             fMatchCloseParen = -1;
2524         }
2525         return TRUE;
2526     }
2527 
2528     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2529         // The thing being repeated is not a single op, but some
2530         //   more complex block.  Do it as a loop, not inlines.
2531         //   Note that things "repeated" a max of once are handled as inline, because
2532         //     the one copy of the code already generated is just fine.
2533         return FALSE;
2534     }
2535 
2536     // Pick up the opcode that is to be repeated
2537     //
2538     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2539 
2540     // Compute the pattern location where the inline sequence
2541     //   will end, and set up the state save op that will be needed.
2542     //
2543     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2544                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2545     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2546     if (fIntervalLow == 0) {
2547         insertOp(topOfBlock);
2548         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2549     }
2550 
2551 
2552 
2553     //  Loop, emitting the op for the thing being repeated each time.
2554     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2555     //    it was put there when it was originally encountered.
2556     int32_t i;
2557     for (i=1; i<fIntervalUpper; i++ ) {
2558         if (i >= fIntervalLow) {
2559             appendOp(saveOp);
2560         }
2561         appendOp(op);
2562     }
2563     return TRUE;
2564 }
2565 
2566 
2567 
2568 //------------------------------------------------------------------------------
2569 //
2570 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2571 //                         set of characters that could potentially begin a case-insensitive
2572 //                         match of a string beginning with that character, using full Unicode
2573 //                         case insensitive matching.
2574 //
2575 //          This is used in optimizing find().
2576 //
2577 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2578 //          misses cases like this:
2579 //             A string from the pattern begins with 'ss' (although all we know
2580 //                 in this context is that it begins with 's')
2581 //             The pattern could match a string beginning with a German sharp-s
2582 //
2583 //           To the ordinary case closure for a character c, we add all other
2584 //           characters cx where the case closure of cx includes a string form that begins
2585 //           with the original character c.
2586 //
2587 //           This function could be made smarter. The full pattern string is available
2588 //           and it would be possible to verify that the extra characters being added
2589 //           to the starting set fully match, rather than having just a first-char of the
2590 //           folded form match.
2591 //
2592 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2593 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2594 
2595 // Machine Generated below.
2596 // It may need updating with new versions of Unicode.
2597 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2598 // The update tool is here:
2599 // https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
2600 
2601 // Machine Generated Data. Do not hand edit.
2602     static const UChar32 RECaseFixCodePoints[] = {
2603         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2604         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2605         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2606         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2607         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2608 
2609     static const int16_t RECaseFixStringOffsets[] = {
2610         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2611         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2612         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2613         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2614         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2615 
2616     static const int16_t RECaseFixCounts[] = {
2617         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2618         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2619         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2620         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2621         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2622 
2623     static const UChar RECaseFixData[] = {
2624         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2625         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2626         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2627         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2628         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2629         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2630         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2631         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2632         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2633         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2634         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2635 
2636 // End of machine generated data.
2637 
2638     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2639         // This function should never be called with an invalid input character.
2640         UPRV_UNREACHABLE_EXIT;
2641     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2642         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2643         starterChars->set(caseFoldedC, caseFoldedC);
2644 
2645         int32_t i;
2646         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2647             // Simple linear search through the sorted list of interesting code points.
2648         }
2649 
2650         if (RECaseFixCodePoints[i] == c) {
2651             int32_t dataIndex = RECaseFixStringOffsets[i];
2652             int32_t numCharsToAdd = RECaseFixCounts[i];
2653             UChar32 cpToAdd = 0;
2654             for (int32_t j=0; j<numCharsToAdd; j++) {
2655                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2656                 starterChars->add(cpToAdd);
2657             }
2658         }
2659 
2660         starterChars->closeOver(USET_CASE_INSENSITIVE);
2661         starterChars->removeAllStrings();
2662     } else {
2663         // Not a cased character. Just return it alone.
2664         starterChars->set(c, c);
2665     }
2666 }
2667 
2668 
2669 // Increment with overflow check.
2670 // val and delta will both be positive.
2671 
safeIncrement(int32_t val,int32_t delta)2672 static int32_t safeIncrement(int32_t val, int32_t delta) {
2673     if (INT32_MAX - val > delta) {
2674         return val + delta;
2675     } else {
2676         return INT32_MAX;
2677     }
2678 }
2679 
2680 
2681 //------------------------------------------------------------------------------
2682 //
2683 //   matchStartType    Determine how a match can start.
2684 //                     Used to optimize find() operations.
2685 //
2686 //                     Operation is very similar to minMatchLength().  Walk the compiled
2687 //                     pattern, keeping an on-going minimum-match-length.  For any
2688 //                     op where the min match coming in is zero, add that ops possible
2689 //                     starting matches to the possible starts for the overall pattern.
2690 //
2691 //------------------------------------------------------------------------------
matchStartType()2692 void   RegexCompile::matchStartType() {
2693     if (U_FAILURE(*fStatus)) {
2694         return;
2695     }
2696 
2697 
2698     int32_t    loc;                    // Location in the pattern of the current op being processed.
2699     int32_t    op;                     // The op being processed
2700     int32_t    opType;                 // The opcode type of the op
2701     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2702     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2703 
2704     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2705                                        //   could have advanced the position in a match.
2706                                        //   (Maximum match length so far == 0)
2707 
2708     // forwardedLength is a vector holding minimum-match-length values that
2709     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2710     //   It must be one longer than the pattern being checked because some  ops
2711     //   will jmp to a end-of-block+1 location from within a block, and we must
2712     //   count those when checking the block.
2713     int32_t end = fRXPat->fCompiledPat->size();
2714     UVector32  forwardedLength(end+1, *fStatus);
2715     forwardedLength.setSize(end+1);
2716     for (loc=3; loc<end; loc++) {
2717         forwardedLength.setElementAt(INT32_MAX, loc);
2718     }
2719 
2720     for (loc = 3; loc<end; loc++) {
2721         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2722         opType = URX_TYPE(op);
2723 
2724         // The loop is advancing linearly through the pattern.
2725         // If the op we are now at was the destination of a branch in the pattern,
2726         // and that path has a shorter minimum length than the current accumulated value,
2727         // replace the current accumulated value.
2728         if (forwardedLength.elementAti(loc) < currentLen) {
2729             currentLen = forwardedLength.elementAti(loc);
2730             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2731         }
2732 
2733         switch (opType) {
2734             // Ops that don't change the total length matched
2735         case URX_RESERVED_OP:
2736         case URX_END:
2737         case URX_FAIL:
2738         case URX_STRING_LEN:
2739         case URX_NOP:
2740         case URX_START_CAPTURE:
2741         case URX_END_CAPTURE:
2742         case URX_BACKSLASH_B:
2743         case URX_BACKSLASH_BU:
2744         case URX_BACKSLASH_G:
2745         case URX_BACKSLASH_Z:
2746         case URX_DOLLAR:
2747         case URX_DOLLAR_M:
2748         case URX_DOLLAR_D:
2749         case URX_DOLLAR_MD:
2750         case URX_RELOC_OPRND:
2751         case URX_STO_INP_LOC:
2752         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2753         case URX_BACKREF_I:
2754 
2755         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2756         case URX_LD_SP:
2757             break;
2758 
2759         case URX_CARET:
2760             if (atStart) {
2761                 fRXPat->fStartType = START_START;
2762             }
2763             break;
2764 
2765         case URX_CARET_M:
2766         case URX_CARET_M_UNIX:
2767             if (atStart) {
2768                 fRXPat->fStartType = START_LINE;
2769             }
2770             break;
2771 
2772         case URX_ONECHAR:
2773             if (currentLen == 0) {
2774                 // This character could appear at the start of a match.
2775                 //   Add it to the set of possible starting characters.
2776                 fRXPat->fInitialChars->add(URX_VAL(op));
2777                 numInitialStrings += 2;
2778             }
2779             currentLen = safeIncrement(currentLen, 1);
2780             atStart = FALSE;
2781             break;
2782 
2783 
2784         case URX_SETREF:
2785             if (currentLen == 0) {
2786                 int32_t  sn = URX_VAL(op);
2787                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2788                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2789                 fRXPat->fInitialChars->addAll(*s);
2790                 numInitialStrings += 2;
2791             }
2792             currentLen = safeIncrement(currentLen, 1);
2793             atStart = FALSE;
2794             break;
2795 
2796         case URX_LOOP_SR_I:
2797             // [Set]*, like a SETREF, above, in what it can match,
2798             //  but may not match at all, so currentLen is not incremented.
2799             if (currentLen == 0) {
2800                 int32_t  sn = URX_VAL(op);
2801                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2802                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2803                 fRXPat->fInitialChars->addAll(*s);
2804                 numInitialStrings += 2;
2805             }
2806             atStart = FALSE;
2807             break;
2808 
2809         case URX_LOOP_DOT_I:
2810             if (currentLen == 0) {
2811                 // .* at the start of a pattern.
2812                 //    Any character can begin the match.
2813                 fRXPat->fInitialChars->clear();
2814                 fRXPat->fInitialChars->complement();
2815                 numInitialStrings += 2;
2816             }
2817             atStart = FALSE;
2818             break;
2819 
2820 
2821         case URX_STATIC_SETREF:
2822             if (currentLen == 0) {
2823                 int32_t  sn = URX_VAL(op);
2824                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2825                 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2826                 fRXPat->fInitialChars->addAll(s);
2827                 numInitialStrings += 2;
2828             }
2829             currentLen = safeIncrement(currentLen, 1);
2830             atStart = FALSE;
2831             break;
2832 
2833 
2834 
2835         case URX_STAT_SETREF_N:
2836             if (currentLen == 0) {
2837                 int32_t  sn = URX_VAL(op);
2838                 UnicodeSet sc;
2839                 sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2840                 fRXPat->fInitialChars->addAll(sc);
2841                 numInitialStrings += 2;
2842             }
2843             currentLen = safeIncrement(currentLen, 1);
2844             atStart = FALSE;
2845             break;
2846 
2847 
2848 
2849         case URX_BACKSLASH_D:
2850             // Digit Char
2851              if (currentLen == 0) {
2852                  UnicodeSet s;
2853                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2854                  if (URX_VAL(op) != 0) {
2855                      s.complement();
2856                  }
2857                  fRXPat->fInitialChars->addAll(s);
2858                  numInitialStrings += 2;
2859             }
2860             currentLen = safeIncrement(currentLen, 1);
2861             atStart = FALSE;
2862             break;
2863 
2864 
2865         case URX_BACKSLASH_H:
2866             // Horiz white space
2867             if (currentLen == 0) {
2868                 UnicodeSet s;
2869                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2870                 s.add((UChar32)9);   // Tab
2871                 if (URX_VAL(op) != 0) {
2872                     s.complement();
2873                 }
2874                 fRXPat->fInitialChars->addAll(s);
2875                 numInitialStrings += 2;
2876             }
2877             currentLen = safeIncrement(currentLen, 1);
2878             atStart = FALSE;
2879             break;
2880 
2881 
2882         case URX_BACKSLASH_R:       // Any line ending sequence
2883         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2884             if (currentLen == 0) {
2885                 UnicodeSet s;
2886                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2887                 s.add((UChar32)0x85);
2888                 s.add((UChar32)0x2028, (UChar32)0x2029);
2889                 if (URX_VAL(op) != 0) {
2890                      // Complement option applies to URX_BACKSLASH_V only.
2891                      s.complement();
2892                 }
2893                 fRXPat->fInitialChars->addAll(s);
2894                 numInitialStrings += 2;
2895             }
2896             currentLen = safeIncrement(currentLen, 1);
2897             atStart = FALSE;
2898             break;
2899 
2900 
2901 
2902         case URX_ONECHAR_I:
2903             // Case Insensitive Single Character.
2904             if (currentLen == 0) {
2905                 UChar32  c = URX_VAL(op);
2906                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2907                     UnicodeSet starters(c, c);
2908                     starters.closeOver(USET_CASE_INSENSITIVE);
2909                     // findCaseInsensitiveStarters(c, &starters);
2910                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2911                     //   The expanded folding can't match the pattern.
2912                     fRXPat->fInitialChars->addAll(starters);
2913                 } else {
2914                     // Char has no case variants.  Just add it as-is to the
2915                     //   set of possible starting chars.
2916                     fRXPat->fInitialChars->add(c);
2917                 }
2918                 numInitialStrings += 2;
2919             }
2920             currentLen = safeIncrement(currentLen, 1);
2921             atStart = FALSE;
2922             break;
2923 
2924 
2925         case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
2926         case URX_DOTANY_ALL:    // . matches one or two.
2927         case URX_DOTANY:
2928         case URX_DOTANY_UNIX:
2929             if (currentLen == 0) {
2930                 // These constructs are all bad news when they appear at the start
2931                 //   of a match.  Any character can begin the match.
2932                 fRXPat->fInitialChars->clear();
2933                 fRXPat->fInitialChars->complement();
2934                 numInitialStrings += 2;
2935             }
2936             currentLen = safeIncrement(currentLen, 1);
2937             atStart = FALSE;
2938             break;
2939 
2940 
2941         case URX_JMPX:
2942             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2943             U_FALLTHROUGH;
2944         case URX_JMP:
2945             {
2946                 int32_t  jmpDest = URX_VAL(op);
2947                 if (jmpDest < loc) {
2948                     // Loop of some kind.  Can safely ignore, the worst that will happen
2949                     //  is that we understate the true minimum length
2950                     currentLen = forwardedLength.elementAti(loc+1);
2951 
2952                 } else {
2953                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2954                     U_ASSERT(jmpDest <= end+1);
2955                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2956                         forwardedLength.setElementAt(currentLen, jmpDest);
2957                     }
2958                 }
2959             }
2960             atStart = FALSE;
2961             break;
2962 
2963         case URX_JMP_SAV:
2964         case URX_JMP_SAV_X:
2965             // Combo of state save to the next loc, + jmp backwards.
2966             //   Net effect on min. length computation is nothing.
2967             atStart = FALSE;
2968             break;
2969 
2970         case URX_BACKTRACK:
2971             // Fails are kind of like a branch, except that the min length was
2972             //   propagated already, by the state save.
2973             currentLen = forwardedLength.elementAti(loc+1);
2974             atStart = FALSE;
2975             break;
2976 
2977 
2978         case URX_STATE_SAVE:
2979             {
2980                 // State Save, for forward jumps, propagate the current minimum.
2981                 //             of the state save.
2982                 int32_t  jmpDest = URX_VAL(op);
2983                 if (jmpDest > loc) {
2984                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2985                         forwardedLength.setElementAt(currentLen, jmpDest);
2986                     }
2987                 }
2988             }
2989             atStart = FALSE;
2990             break;
2991 
2992 
2993 
2994 
2995         case URX_STRING:
2996             {
2997                 loc++;
2998                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2999                 int32_t stringLen   = URX_VAL(stringLenOp);
3000                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3001                 U_ASSERT(stringLenOp >= 2);
3002                 if (currentLen == 0) {
3003                     // Add the starting character of this string to the set of possible starting
3004                     //   characters for this pattern.
3005                     int32_t stringStartIdx = URX_VAL(op);
3006                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3007                     fRXPat->fInitialChars->add(c);
3008 
3009                     // Remember this string.  After the entire pattern has been checked,
3010                     //  if nothing else is identified that can start a match, we'll use it.
3011                     numInitialStrings++;
3012                     fRXPat->fInitialStringIdx = stringStartIdx;
3013                     fRXPat->fInitialStringLen = stringLen;
3014                 }
3015 
3016                 currentLen = safeIncrement(currentLen, stringLen);
3017                 atStart = FALSE;
3018             }
3019             break;
3020 
3021         case URX_STRING_I:
3022             {
3023                 // Case-insensitive string.  Unlike exact-match strings, we won't
3024                 //   attempt a string search for possible match positions.  But we
3025                 //   do update the set of possible starting characters.
3026                 loc++;
3027                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3028                 int32_t stringLen   = URX_VAL(stringLenOp);
3029                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3030                 U_ASSERT(stringLenOp >= 2);
3031                 if (currentLen == 0) {
3032                     // Add the starting character of this string to the set of possible starting
3033                     //   characters for this pattern.
3034                     int32_t stringStartIdx = URX_VAL(op);
3035                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3036                     UnicodeSet s;
3037                     findCaseInsensitiveStarters(c, &s);
3038                     fRXPat->fInitialChars->addAll(s);
3039                     numInitialStrings += 2;  // Matching on an initial string not possible.
3040                 }
3041                 currentLen = safeIncrement(currentLen, stringLen);
3042                 atStart = FALSE;
3043             }
3044             break;
3045 
3046         case URX_CTR_INIT:
3047         case URX_CTR_INIT_NG:
3048             {
3049                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3050                 //   so location must be updated accordingly.
3051                 // Loop Init Ops.
3052                 //   If the min loop count == 0
3053                 //      move loc forwards to the end of the loop, skipping over the body.
3054                 //   If the min count is > 0,
3055                 //      continue normal processing of the body of the loop.
3056                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3057                         loopEndLoc   = URX_VAL(loopEndLoc);
3058                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3059                 if (minLoopCount == 0) {
3060                     // Min Loop Count of 0, treat like a forward branch and
3061                     //   move the current minimum length up to the target
3062                     //   (end of loop) location.
3063                     U_ASSERT(loopEndLoc <= end+1);
3064                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3065                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3066                     }
3067                 }
3068                 loc+=3;  // Skips over operands of CTR_INIT
3069             }
3070             atStart = FALSE;
3071             break;
3072 
3073 
3074         case URX_CTR_LOOP:
3075         case URX_CTR_LOOP_NG:
3076             // Loop ops.
3077             //  The jump is conditional, backwards only.
3078             atStart = FALSE;
3079             break;
3080 
3081         case URX_LOOP_C:
3082             // More loop ops.  These state-save to themselves.
3083             //   don't change the minimum match
3084             atStart = FALSE;
3085             break;
3086 
3087 
3088         case URX_LA_START:
3089         case URX_LB_START:
3090             {
3091                 // Look-around.  Scan forward until the matching look-ahead end,
3092                 //   without processing the look-around block.  This is overly pessimistic.
3093 
3094                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3095                 //   lookahead contains two LA_END instructions, so count goes up by two
3096                 //   for each LA_START.
3097                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3098                 for (;;) {
3099                     loc++;
3100                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3101                     if (URX_TYPE(op) == URX_LA_START) {
3102                         depth+=2;
3103                     }
3104                     if (URX_TYPE(op) == URX_LB_START) {
3105                         depth++;
3106                     }
3107                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3108                         depth--;
3109                         if (depth == 0) {
3110                             break;
3111                         }
3112                     }
3113                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3114                         // Need this because neg lookahead blocks will FAIL to outside
3115                         //   of the block.
3116                         int32_t  jmpDest = URX_VAL(op);
3117                         if (jmpDest > loc) {
3118                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3119                                 forwardedLength.setElementAt(currentLen, jmpDest);
3120                             }
3121                         }
3122                     }
3123                     U_ASSERT(loc <= end);
3124                 }
3125             }
3126             break;
3127 
3128         case URX_LA_END:
3129         case URX_LB_CONT:
3130         case URX_LB_END:
3131         case URX_LBN_CONT:
3132         case URX_LBN_END:
3133             UPRV_UNREACHABLE_EXIT;  // Shouldn't get here.  These ops should be
3134                                     //  consumed by the scan in URX_LA_START and LB_START
3135         default:
3136             UPRV_UNREACHABLE_EXIT;
3137             }
3138 
3139         }
3140 
3141 
3142     // We have finished walking through the ops.  Check whether some forward jump
3143     //   propagated a shorter length to location end+1.
3144     if (forwardedLength.elementAti(end+1) < currentLen) {
3145         currentLen = forwardedLength.elementAti(end+1);
3146     }
3147 
3148 
3149     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3150 
3151 
3152     // Sort out what we should check for when looking for candidate match start positions.
3153     // In order of preference,
3154     //     1.   Start of input text buffer.
3155     //     2.   A literal string.
3156     //     3.   Start of line in multi-line mode.
3157     //     4.   A single literal character.
3158     //     5.   A character from a set of characters.
3159     //
3160     if (fRXPat->fStartType == START_START) {
3161         // Match only at the start of an input text string.
3162         //    start type is already set.  We're done.
3163     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3164         // Match beginning only with a literal string.
3165         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3166         U_ASSERT(fRXPat->fInitialChars->contains(c));
3167         fRXPat->fStartType   = START_STRING;
3168         fRXPat->fInitialChar = c;
3169     } else if (fRXPat->fStartType == START_LINE) {
3170         // Match at start of line in Multi-Line mode.
3171         // Nothing to do here; everything is already set.
3172     } else if (fRXPat->fMinMatchLen == 0) {
3173         // Zero length match possible.  We could start anywhere.
3174         fRXPat->fStartType = START_NO_INFO;
3175     } else if (fRXPat->fInitialChars->size() == 1) {
3176         // All matches begin with the same char.
3177         fRXPat->fStartType   = START_CHAR;
3178         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3179         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3180     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3181         fRXPat->fMinMatchLen > 0) {
3182         // Matches start with a set of character smaller than the set of all chars.
3183         fRXPat->fStartType = START_SET;
3184     } else {
3185         // Matches can start with anything
3186         fRXPat->fStartType = START_NO_INFO;
3187     }
3188 
3189     return;
3190 }
3191 
3192 
3193 
3194 //------------------------------------------------------------------------------
3195 //
3196 //   minMatchLength    Calculate the length of the shortest string that could
3197 //                     match the specified pattern.
3198 //                     Length is in 16 bit code units, not code points.
3199 //
3200 //                     The calculated length may not be exact.  The returned
3201 //                     value may be shorter than the actual minimum; it must
3202 //                     never be longer.
3203 //
3204 //                     start and end are the range of p-code operations to be
3205 //                     examined.  The endpoints are included in the range.
3206 //
3207 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3208 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3209     if (U_FAILURE(*fStatus)) {
3210         return 0;
3211     }
3212 
3213     U_ASSERT(start <= end);
3214     U_ASSERT(end < fRXPat->fCompiledPat->size());
3215 
3216 
3217     int32_t    loc;
3218     int32_t    op;
3219     int32_t    opType;
3220     int32_t    currentLen = 0;
3221 
3222 
3223     // forwardedLength is a vector holding minimum-match-length values that
3224     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3225     //   It must be one longer than the pattern being checked because some  ops
3226     //   will jmp to a end-of-block+1 location from within a block, and we must
3227     //   count those when checking the block.
3228     UVector32  forwardedLength(end+2, *fStatus);
3229     forwardedLength.setSize(end+2);
3230     for (loc=start; loc<=end+1; loc++) {
3231         forwardedLength.setElementAt(INT32_MAX, loc);
3232     }
3233 
3234     for (loc = start; loc<=end; loc++) {
3235         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3236         opType = URX_TYPE(op);
3237 
3238         // The loop is advancing linearly through the pattern.
3239         // If the op we are now at was the destination of a branch in the pattern,
3240         // and that path has a shorter minimum length than the current accumulated value,
3241         // replace the current accumulated value.
3242         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3243                                                                //   no-match-possible cases.
3244         if (forwardedLength.elementAti(loc) < currentLen) {
3245             currentLen = forwardedLength.elementAti(loc);
3246             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3247         }
3248 
3249         switch (opType) {
3250             // Ops that don't change the total length matched
3251         case URX_RESERVED_OP:
3252         case URX_END:
3253         case URX_STRING_LEN:
3254         case URX_NOP:
3255         case URX_START_CAPTURE:
3256         case URX_END_CAPTURE:
3257         case URX_BACKSLASH_B:
3258         case URX_BACKSLASH_BU:
3259         case URX_BACKSLASH_G:
3260         case URX_BACKSLASH_Z:
3261         case URX_CARET:
3262         case URX_DOLLAR:
3263         case URX_DOLLAR_M:
3264         case URX_DOLLAR_D:
3265         case URX_DOLLAR_MD:
3266         case URX_RELOC_OPRND:
3267         case URX_STO_INP_LOC:
3268         case URX_CARET_M:
3269         case URX_CARET_M_UNIX:
3270         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3271         case URX_BACKREF_I:
3272 
3273         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3274         case URX_LD_SP:
3275 
3276         case URX_JMP_SAV:
3277         case URX_JMP_SAV_X:
3278             break;
3279 
3280 
3281             // Ops that match a minimum of one character (one or two 16 bit code units.)
3282             //
3283         case URX_ONECHAR:
3284         case URX_STATIC_SETREF:
3285         case URX_STAT_SETREF_N:
3286         case URX_SETREF:
3287         case URX_BACKSLASH_D:
3288         case URX_BACKSLASH_H:
3289         case URX_BACKSLASH_R:
3290         case URX_BACKSLASH_V:
3291         case URX_ONECHAR_I:
3292         case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3293         case URX_DOTANY_ALL:    // . matches one or two.
3294         case URX_DOTANY:
3295         case URX_DOTANY_UNIX:
3296             currentLen = safeIncrement(currentLen, 1);
3297             break;
3298 
3299 
3300         case URX_JMPX:
3301             loc++;              // URX_JMPX has an extra operand, ignored here,
3302                                 //   otherwise processed identically to URX_JMP.
3303             U_FALLTHROUGH;
3304         case URX_JMP:
3305             {
3306                 int32_t  jmpDest = URX_VAL(op);
3307                 if (jmpDest < loc) {
3308                     // Loop of some kind.  Can safely ignore, the worst that will happen
3309                     //  is that we understate the true minimum length
3310                     currentLen = forwardedLength.elementAti(loc+1);
3311                 } else {
3312                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3313                     U_ASSERT(jmpDest <= end+1);
3314                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3315                         forwardedLength.setElementAt(currentLen, jmpDest);
3316                     }
3317                 }
3318             }
3319             break;
3320 
3321         case URX_BACKTRACK:
3322             {
3323                 // Back-tracks are kind of like a branch, except that the min length was
3324                 //   propagated already, by the state save.
3325                 currentLen = forwardedLength.elementAti(loc+1);
3326             }
3327             break;
3328 
3329 
3330         case URX_STATE_SAVE:
3331             {
3332                 // State Save, for forward jumps, propagate the current minimum.
3333                 //             of the state save.
3334                 int32_t  jmpDest = URX_VAL(op);
3335                 if (jmpDest > loc) {
3336                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3337                         forwardedLength.setElementAt(currentLen, jmpDest);
3338                     }
3339                 }
3340             }
3341             break;
3342 
3343 
3344         case URX_STRING:
3345             {
3346                 loc++;
3347                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3348                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3349             }
3350             break;
3351 
3352 
3353         case URX_STRING_I:
3354             {
3355                 loc++;
3356                 // TODO: with full case folding, matching input text may be shorter than
3357                 //       the string we have here.  More smarts could put some bounds on it.
3358                 //       Assume a min length of one for now.  A min length of zero causes
3359                 //        optimization failures for a pattern like "string"+
3360                 // currentLen += URX_VAL(stringLenOp);
3361                 currentLen = safeIncrement(currentLen, 1);
3362             }
3363             break;
3364 
3365         case URX_CTR_INIT:
3366         case URX_CTR_INIT_NG:
3367             {
3368                 // Loop Init Ops.
3369                 //   If the min loop count == 0
3370                 //      move loc forwards to the end of the loop, skipping over the body.
3371                 //   If the min count is > 0,
3372                 //      continue normal processing of the body of the loop.
3373                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3374                         loopEndLoc   = URX_VAL(loopEndLoc);
3375                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3376                 if (minLoopCount == 0) {
3377                     loc = loopEndLoc;
3378                 } else {
3379                     loc+=3;  // Skips over operands of CTR_INIT
3380                 }
3381             }
3382             break;
3383 
3384 
3385         case URX_CTR_LOOP:
3386         case URX_CTR_LOOP_NG:
3387             // Loop ops.
3388             //  The jump is conditional, backwards only.
3389             break;
3390 
3391         case URX_LOOP_SR_I:
3392         case URX_LOOP_DOT_I:
3393         case URX_LOOP_C:
3394             // More loop ops.  These state-save to themselves.
3395             //   don't change the minimum match - could match nothing at all.
3396             break;
3397 
3398 
3399         case URX_LA_START:
3400         case URX_LB_START:
3401             {
3402                 // Look-around.  Scan forward until the matching look-ahead end,
3403                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3404                 //   it assumes that the look-ahead match might be zero-length.
3405                 //   TODO:  Positive lookahead could recursively do the block, then continue
3406                 //          with the longer of the block or the value coming in.  Ticket 6060
3407                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3408                 for (;;) {
3409                     loc++;
3410                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3411                     if (URX_TYPE(op) == URX_LA_START) {
3412                         // The boilerplate for look-ahead includes two LA_END instructions,
3413                         //    Depth will be decremented by each one when it is seen.
3414                         depth += 2;
3415                     }
3416                     if (URX_TYPE(op) == URX_LB_START) {
3417                         depth++;
3418                     }
3419                     if (URX_TYPE(op) == URX_LA_END) {
3420                         depth--;
3421                         if (depth == 0) {
3422                             break;
3423                         }
3424                     }
3425                     if (URX_TYPE(op)==URX_LBN_END) {
3426                         depth--;
3427                         if (depth == 0) {
3428                             break;
3429                         }
3430                     }
3431                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3432                         // Need this because neg lookahead blocks will FAIL to outside
3433                         //   of the block.
3434                         int32_t  jmpDest = URX_VAL(op);
3435                         if (jmpDest > loc) {
3436                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3437                                 forwardedLength.setElementAt(currentLen, jmpDest);
3438                             }
3439                         }
3440                     }
3441                     U_ASSERT(loc <= end);
3442                 }
3443             }
3444             break;
3445 
3446         case URX_LA_END:
3447         case URX_LB_CONT:
3448         case URX_LB_END:
3449         case URX_LBN_CONT:
3450         case URX_LBN_END:
3451             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3452             //   range being sized, which happens when measuring size of look-behind blocks.
3453             break;
3454 
3455         default:
3456             UPRV_UNREACHABLE_EXIT;
3457             }
3458 
3459         }
3460 
3461     // We have finished walking through the ops.  Check whether some forward jump
3462     //   propagated a shorter length to location end+1.
3463     if (forwardedLength.elementAti(end+1) < currentLen) {
3464         currentLen = forwardedLength.elementAti(end+1);
3465         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3466     }
3467 
3468     return currentLen;
3469 }
3470 
3471 //------------------------------------------------------------------------------
3472 //
3473 //   maxMatchLength    Calculate the length of the longest string that could
3474 //                     match the specified pattern.
3475 //                     Length is in 16 bit code units, not code points.
3476 //
3477 //                     The calculated length may not be exact.  The returned
3478 //                     value may be longer than the actual maximum; it must
3479 //                     never be shorter.
3480 //
3481 //                     start, end: the range of the pattern to check.
3482 //                     end is inclusive.
3483 //
3484 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3485 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3486     if (U_FAILURE(*fStatus)) {
3487         return 0;
3488     }
3489     U_ASSERT(start <= end);
3490     U_ASSERT(end < fRXPat->fCompiledPat->size());
3491 
3492     int32_t    loc;
3493     int32_t    op;
3494     int32_t    opType;
3495     int32_t    currentLen = 0;
3496     UVector32  forwardedLength(end+1, *fStatus);
3497     forwardedLength.setSize(end+1);
3498 
3499     for (loc=start; loc<=end; loc++) {
3500         forwardedLength.setElementAt(0, loc);
3501     }
3502 
3503     for (loc = start; loc<=end; loc++) {
3504         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3505         opType = URX_TYPE(op);
3506 
3507         // The loop is advancing linearly through the pattern.
3508         // If the op we are now at was the destination of a branch in the pattern,
3509         // and that path has a longer maximum length than the current accumulated value,
3510         // replace the current accumulated value.
3511         if (forwardedLength.elementAti(loc) > currentLen) {
3512             currentLen = forwardedLength.elementAti(loc);
3513         }
3514 
3515         switch (opType) {
3516             // Ops that don't change the total length matched
3517         case URX_RESERVED_OP:
3518         case URX_END:
3519         case URX_STRING_LEN:
3520         case URX_NOP:
3521         case URX_START_CAPTURE:
3522         case URX_END_CAPTURE:
3523         case URX_BACKSLASH_B:
3524         case URX_BACKSLASH_BU:
3525         case URX_BACKSLASH_G:
3526         case URX_BACKSLASH_Z:
3527         case URX_CARET:
3528         case URX_DOLLAR:
3529         case URX_DOLLAR_M:
3530         case URX_DOLLAR_D:
3531         case URX_DOLLAR_MD:
3532         case URX_RELOC_OPRND:
3533         case URX_STO_INP_LOC:
3534         case URX_CARET_M:
3535         case URX_CARET_M_UNIX:
3536 
3537         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3538         case URX_LD_SP:
3539 
3540         case URX_LB_END:
3541         case URX_LB_CONT:
3542         case URX_LBN_CONT:
3543         case URX_LBN_END:
3544             break;
3545 
3546 
3547             // Ops that increase that cause an unbounded increase in the length
3548             //   of a matched string, or that increase it a hard to characterize way.
3549             //   Call the max length unbounded, and stop further checking.
3550         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3551         case URX_BACKREF_I:
3552         case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3553             currentLen = INT32_MAX;
3554             break;
3555 
3556 
3557             // Ops that match a max of one character (possibly two 16 bit code units.)
3558             //
3559         case URX_STATIC_SETREF:
3560         case URX_STAT_SETREF_N:
3561         case URX_SETREF:
3562         case URX_BACKSLASH_D:
3563         case URX_BACKSLASH_H:
3564         case URX_BACKSLASH_R:
3565         case URX_BACKSLASH_V:
3566         case URX_ONECHAR_I:
3567         case URX_DOTANY_ALL:
3568         case URX_DOTANY:
3569         case URX_DOTANY_UNIX:
3570             currentLen = safeIncrement(currentLen, 2);
3571             break;
3572 
3573             // Single literal character.  Increase current max length by one or two,
3574             //       depending on whether the char is in the supplementary range.
3575         case URX_ONECHAR:
3576             currentLen = safeIncrement(currentLen, 1);
3577             if (URX_VAL(op) > 0x10000) {
3578                 currentLen = safeIncrement(currentLen, 1);
3579             }
3580             break;
3581 
3582             // Jumps.
3583             //
3584         case URX_JMP:
3585         case URX_JMPX:
3586         case URX_JMP_SAV:
3587         case URX_JMP_SAV_X:
3588             {
3589                 int32_t  jmpDest = URX_VAL(op);
3590                 if (jmpDest < loc) {
3591                     // Loop of some kind.  Max match length is unbounded.
3592                     currentLen = INT32_MAX;
3593                 } else {
3594                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3595                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3596                         forwardedLength.setElementAt(currentLen, jmpDest);
3597                     }
3598                     currentLen = 0;
3599                 }
3600             }
3601             break;
3602 
3603         case URX_BACKTRACK:
3604             // back-tracks are kind of like a branch, except that the max length was
3605             //   propagated already, by the state save.
3606             currentLen = forwardedLength.elementAti(loc+1);
3607             break;
3608 
3609 
3610         case URX_STATE_SAVE:
3611             {
3612                 // State Save, for forward jumps, propagate the current minimum.
3613                 //               of the state save.
3614                 //             For backwards jumps, they create a loop, maximum
3615                 //               match length is unbounded.
3616                 int32_t  jmpDest = URX_VAL(op);
3617                 if (jmpDest > loc) {
3618                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3619                         forwardedLength.setElementAt(currentLen, jmpDest);
3620                     }
3621                 } else {
3622                     currentLen = INT32_MAX;
3623                 }
3624             }
3625             break;
3626 
3627 
3628 
3629 
3630         case URX_STRING:
3631             {
3632                 loc++;
3633                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3634                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3635                 break;
3636             }
3637 
3638         case URX_STRING_I:
3639             // TODO:  This code assumes that any user string that matches will be no longer
3640             //        than our compiled string, with case insensitive matching.
3641             //        Our compiled string has been case-folded already.
3642             //
3643             //        Any matching user string will have no more code points than our
3644             //        compiled (folded) string.  Folding may add code points, but
3645             //        not remove them.
3646             //
3647             //        There is a potential problem if a supplemental code point
3648             //        case-folds to a BMP code point.  In this case our compiled string
3649             //        could be shorter (in code units) than a matching user string.
3650             //
3651             //        At this time (Unicode 6.1) there are no such characters, and this case
3652             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3653             //        any problematic characters are added to Unicode.
3654             //
3655             //        If this happens, we can make a set of the BMP chars that the
3656             //        troublesome supplementals fold to, scan our string, and bump the
3657             //        currentLen one extra for each that is found.
3658             //
3659             {
3660                 loc++;
3661                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3662                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3663             }
3664             break;
3665 
3666         case URX_CTR_INIT:
3667         case URX_CTR_INIT_NG:
3668             // For Loops, recursively call this function on the pattern for the loop body,
3669             //   then multiply the result by the maximum loop count.
3670             {
3671                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3672                 if (loopEndLoc == loc+4) {
3673                     // Loop has an empty body. No affect on max match length.
3674                     // Continue processing with code after the loop end.
3675                     loc = loopEndLoc;
3676                     break;
3677                 }
3678 
3679                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3680                 if (maxLoopCount == -1) {
3681                     // Unbounded Loop. No upper bound on match length.
3682                     currentLen = INT32_MAX;
3683                     break;
3684                 }
3685 
3686                 U_ASSERT(loopEndLoc >= loc+4);
3687                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3688                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3689                 if (updatedLen >= INT32_MAX) {
3690                     currentLen = INT32_MAX;
3691                     break;
3692                 }
3693                 currentLen = (int32_t)updatedLen;
3694                 loc = loopEndLoc;
3695                 break;
3696             }
3697 
3698         case URX_CTR_LOOP:
3699         case URX_CTR_LOOP_NG:
3700             // These opcodes will be skipped over by code for URX_CTR_INIT.
3701             // We shouldn't encounter them here.
3702             UPRV_UNREACHABLE_EXIT;
3703 
3704         case URX_LOOP_SR_I:
3705         case URX_LOOP_DOT_I:
3706         case URX_LOOP_C:
3707             // For anything to do with loops, make the match length unbounded.
3708             currentLen = INT32_MAX;
3709             break;
3710 
3711 
3712 
3713         case URX_LA_START:
3714         case URX_LA_END:
3715             // Look-ahead.  Just ignore, treat the look-ahead block as if
3716             // it were normal pattern.  Gives a too-long match length,
3717             //  but good enough for now.
3718             break;
3719 
3720             // End of look-ahead ops should always be consumed by the processing at
3721             //  the URX_LA_START op.
3722             // UPRV_UNREACHABLE_EXIT;
3723 
3724         case URX_LB_START:
3725             {
3726                 // Look-behind.  Scan forward until the matching look-around end,
3727                 //   without processing the look-behind block.
3728                 int32_t dataLoc = URX_VAL(op);
3729                 for (loc = loc + 1; loc <= end; ++loc) {
3730                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3731                     int32_t opType = URX_TYPE(op);
3732                     if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3733                         break;
3734                     }
3735                 }
3736                 U_ASSERT(loc <= end);
3737             }
3738             break;
3739 
3740         default:
3741             UPRV_UNREACHABLE_EXIT;
3742         }
3743 
3744 
3745         if (currentLen == INT32_MAX) {
3746             //  The maximum length is unbounded.
3747             //  Stop further processing of the pattern.
3748             break;
3749         }
3750 
3751     }
3752     return currentLen;
3753 
3754 }
3755 
3756 
3757 //------------------------------------------------------------------------------
3758 //
3759 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3760 //                Extra NOPs are inserted for some constructs during the initial
3761 //                code generation to provide locations that may be patched later.
3762 //                Many end up unneeded, and are removed by this function.
3763 //
3764 //                In order to minimize the number of passes through the pattern,
3765 //                back-reference fixup is also performed here (adjusting
3766 //                back-reference operands to point to the correct frame offsets).
3767 //
3768 //------------------------------------------------------------------------------
stripNOPs()3769 void RegexCompile::stripNOPs() {
3770 
3771     if (U_FAILURE(*fStatus)) {
3772         return;
3773     }
3774 
3775     int32_t    end = fRXPat->fCompiledPat->size();
3776     UVector32  deltas(end, *fStatus);
3777 
3778     // Make a first pass over the code, computing the amount that things
3779     //   will be offset at each location in the original code.
3780     int32_t   loc;
3781     int32_t   d = 0;
3782     for (loc=0; loc<end; loc++) {
3783         deltas.addElement(d, *fStatus);
3784         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3785         if (URX_TYPE(op) == URX_NOP) {
3786             d++;
3787         }
3788     }
3789 
3790     UnicodeString caseStringBuffer;
3791 
3792     // Make a second pass over the code, removing the NOPs by moving following
3793     //  code up, and patching operands that refer to code locations that
3794     //  are being moved.  The array of offsets from the first step is used
3795     //  to compute the new operand values.
3796     int32_t src;
3797     int32_t dst = 0;
3798     for (src=0; src<end; src++) {
3799         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3800         int32_t opType = URX_TYPE(op);
3801         switch (opType) {
3802         case URX_NOP:
3803             break;
3804 
3805         case URX_STATE_SAVE:
3806         case URX_JMP:
3807         case URX_CTR_LOOP:
3808         case URX_CTR_LOOP_NG:
3809         case URX_RELOC_OPRND:
3810         case URX_JMPX:
3811         case URX_JMP_SAV:
3812         case URX_JMP_SAV_X:
3813             // These are instructions with operands that refer to code locations.
3814             {
3815                 int32_t  operandAddress = URX_VAL(op);
3816                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3817                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3818                 op = buildOp(opType, fixedOperandAddress);
3819                 fRXPat->fCompiledPat->setElementAt(op, dst);
3820                 dst++;
3821                 break;
3822             }
3823 
3824         case URX_BACKREF:
3825         case URX_BACKREF_I:
3826             {
3827                 int32_t where = URX_VAL(op);
3828                 if (where > fRXPat->fGroupMap->size()) {
3829                     error(U_REGEX_INVALID_BACK_REF);
3830                     break;
3831                 }
3832                 where = fRXPat->fGroupMap->elementAti(where-1);
3833                 op    = buildOp(opType, where);
3834                 fRXPat->fCompiledPat->setElementAt(op, dst);
3835                 dst++;
3836 
3837                 fRXPat->fNeedsAltInput = TRUE;
3838                 break;
3839             }
3840         case URX_RESERVED_OP:
3841         case URX_RESERVED_OP_N:
3842         case URX_BACKTRACK:
3843         case URX_END:
3844         case URX_ONECHAR:
3845         case URX_STRING:
3846         case URX_STRING_LEN:
3847         case URX_START_CAPTURE:
3848         case URX_END_CAPTURE:
3849         case URX_STATIC_SETREF:
3850         case URX_STAT_SETREF_N:
3851         case URX_SETREF:
3852         case URX_DOTANY:
3853         case URX_FAIL:
3854         case URX_BACKSLASH_B:
3855         case URX_BACKSLASH_BU:
3856         case URX_BACKSLASH_G:
3857         case URX_BACKSLASH_X:
3858         case URX_BACKSLASH_Z:
3859         case URX_DOTANY_ALL:
3860         case URX_BACKSLASH_D:
3861         case URX_CARET:
3862         case URX_DOLLAR:
3863         case URX_CTR_INIT:
3864         case URX_CTR_INIT_NG:
3865         case URX_DOTANY_UNIX:
3866         case URX_STO_SP:
3867         case URX_LD_SP:
3868         case URX_STO_INP_LOC:
3869         case URX_LA_START:
3870         case URX_LA_END:
3871         case URX_ONECHAR_I:
3872         case URX_STRING_I:
3873         case URX_DOLLAR_M:
3874         case URX_CARET_M:
3875         case URX_CARET_M_UNIX:
3876         case URX_LB_START:
3877         case URX_LB_CONT:
3878         case URX_LB_END:
3879         case URX_LBN_CONT:
3880         case URX_LBN_END:
3881         case URX_LOOP_SR_I:
3882         case URX_LOOP_DOT_I:
3883         case URX_LOOP_C:
3884         case URX_DOLLAR_D:
3885         case URX_DOLLAR_MD:
3886         case URX_BACKSLASH_H:
3887         case URX_BACKSLASH_R:
3888         case URX_BACKSLASH_V:
3889             // These instructions are unaltered by the relocation.
3890             fRXPat->fCompiledPat->setElementAt(op, dst);
3891             dst++;
3892             break;
3893 
3894         default:
3895             // Some op is unaccounted for.
3896             UPRV_UNREACHABLE_EXIT;
3897         }
3898     }
3899 
3900     fRXPat->fCompiledPat->setSize(dst);
3901 }
3902 
3903 
3904 
3905 
3906 //------------------------------------------------------------------------------
3907 //
3908 //  Error         Report a rule parse error.
3909 //                Only report it if no previous error has been recorded.
3910 //
3911 //------------------------------------------------------------------------------
error(UErrorCode e)3912 void RegexCompile::error(UErrorCode e) {
3913     if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3914         *fStatus = e;
3915         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3916         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3917         // int64_t. If the values of the latter are out of range for the former,
3918         // set them to the appropriate "field not supported" values.
3919         if (fLineNum > 0x7FFFFFFF) {
3920             fParseErr->line   = 0;
3921             fParseErr->offset = -1;
3922         } else if (fCharNum > 0x7FFFFFFF) {
3923             fParseErr->line   = (int32_t)fLineNum;
3924             fParseErr->offset = -1;
3925         } else {
3926             fParseErr->line   = (int32_t)fLineNum;
3927             fParseErr->offset = (int32_t)fCharNum;
3928         }
3929 
3930         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3931 
3932         // Fill in the context.
3933         //   Note: extractBetween() pins supplied indices to the string bounds.
3934         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3935         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3936         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3937         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3938     }
3939 }
3940 
3941 
3942 //
3943 //  Assorted Unicode character constants.
3944 //     Numeric because there is no portable way to enter them as literals.
3945 //     (Think EBCDIC).
3946 //
3947 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3948 static const UChar      chLF        = 0x0a;      // Line Feed
3949 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3950 static const UChar      chDigit0    = 0x30;      // '0'
3951 static const UChar      chDigit7    = 0x37;      // '9'
3952 static const UChar      chColon     = 0x3A;      // ':'
3953 static const UChar      chE         = 0x45;      // 'E'
3954 static const UChar      chQ         = 0x51;      // 'Q'
3955 //static const UChar      chN         = 0x4E;      // 'N'
3956 static const UChar      chP         = 0x50;      // 'P'
3957 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3958 //static const UChar      chLBracket  = 0x5b;      // '['
3959 static const UChar      chRBracket  = 0x5d;      // ']'
3960 static const UChar      chUp        = 0x5e;      // '^'
3961 static const UChar      chLowerP    = 0x70;
3962 static const UChar      chLBrace    = 0x7b;      // '{'
3963 static const UChar      chRBrace    = 0x7d;      // '}'
3964 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3965 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3966 
3967 
3968 //------------------------------------------------------------------------------
3969 //
3970 //  nextCharLL    Low Level Next Char from the regex pattern.
3971 //                Get a char from the string, keep track of input position
3972 //                     for error reporting.
3973 //
3974 //------------------------------------------------------------------------------
nextCharLL()3975 UChar32  RegexCompile::nextCharLL() {
3976     UChar32       ch;
3977 
3978     if (fPeekChar != -1) {
3979         ch = fPeekChar;
3980         fPeekChar = -1;
3981         return ch;
3982     }
3983 
3984     // assume we're already in the right place
3985     ch = UTEXT_NEXT32(fRXPat->fPattern);
3986     if (ch == U_SENTINEL) {
3987         return ch;
3988     }
3989 
3990     if (ch == chCR ||
3991         ch == chNEL ||
3992         ch == chLS   ||
3993         (ch == chLF && fLastChar != chCR)) {
3994         // Character is starting a new line.  Bump up the line number, and
3995         //  reset the column to 0.
3996         fLineNum++;
3997         fCharNum=0;
3998     }
3999     else {
4000         // Character is not starting a new line.  Except in the case of a
4001         //   LF following a CR, increment the column position.
4002         if (ch != chLF) {
4003             fCharNum++;
4004         }
4005     }
4006     fLastChar = ch;
4007     return ch;
4008 }
4009 
4010 //------------------------------------------------------------------------------
4011 //
4012 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
4013 //                 character without actually getting it.
4014 //
4015 //------------------------------------------------------------------------------
peekCharLL()4016 UChar32  RegexCompile::peekCharLL() {
4017     if (fPeekChar == -1) {
4018         fPeekChar = nextCharLL();
4019     }
4020     return fPeekChar;
4021 }
4022 
4023 
4024 //------------------------------------------------------------------------------
4025 //
4026 //   nextChar     for pattern scanning.  At this level, we handle stripping
4027 //                out comments and processing some backslash character escapes.
4028 //                The rest of the pattern grammar is handled at the next level up.
4029 //
4030 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4031 void RegexCompile::nextChar(RegexPatternChar &c) {
4032   tailRecursion:
4033     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4034     c.fChar    = nextCharLL();
4035     c.fQuoted  = FALSE;
4036 
4037     if (fQuoteMode) {
4038         c.fQuoted = TRUE;
4039         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4040             c.fChar == (UChar32)-1) {
4041             fQuoteMode = FALSE;  //  Exit quote mode,
4042             nextCharLL();        // discard the E
4043             // nextChar(c);      // recurse to get the real next char
4044             goto tailRecursion;  // Note: fuzz testing produced testcases that
4045                                  //       resulted in stack overflow here.
4046         }
4047     }
4048     else if (fInBackslashQuote) {
4049         // The current character immediately follows a '\'
4050         // Don't check for any further escapes, just return it as-is.
4051         // Don't set c.fQuoted, because that would prevent the state machine from
4052         //    dispatching on the character.
4053         fInBackslashQuote = FALSE;
4054     }
4055     else
4056     {
4057         // We are not in a \Q quoted region \E of the source.
4058         //
4059         if (fModeFlags & UREGEX_COMMENTS) {
4060             //
4061             // We are in free-spacing and comments mode.
4062             //  Scan through any white space and comments, until we
4063             //  reach a significant character or the end of input.
4064             for (;;) {
4065                 if (c.fChar == (UChar32)-1) {
4066                     break;     // End of Input
4067                 }
4068                 if  (c.fChar == chPound && fEOLComments == TRUE) {
4069                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4070                     for (;;) {
4071                         c.fChar = nextCharLL();
4072                         if (c.fChar == (UChar32)-1 ||  // EOF
4073                             c.fChar == chCR        ||
4074                             c.fChar == chLF        ||
4075                             c.fChar == chNEL       ||
4076                             c.fChar == chLS)       {
4077                             break;
4078                         }
4079                     }
4080                 }
4081                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4082                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4083                     break;
4084                 }
4085                 c.fChar = nextCharLL();
4086             }
4087         }
4088 
4089         //
4090         //  check for backslash escaped characters.
4091         //
4092         if (c.fChar == chBackSlash) {
4093             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4094             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4095                 //
4096                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4097                 //   Includes \uxxxx, \n, \r, many others.
4098                 //   Return the single equivalent character.
4099                 //
4100                 nextCharLL();                 // get & discard the peeked char.
4101                 c.fQuoted = TRUE;
4102 
4103                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4104                     int32_t endIndex = (int32_t)pos;
4105                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4106 
4107                     if (endIndex == pos) {
4108                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4109                     }
4110                     fCharNum += endIndex - pos;
4111                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4112                 } else {
4113                     int32_t offset = 0;
4114                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4115 
4116                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4117                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4118 
4119                     if (offset == 0) {
4120                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4121                     } else if (context.lastOffset == offset) {
4122                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4123                     } else if (context.lastOffset != offset-1) {
4124                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4125                     }
4126                     fCharNum += offset;
4127                 }
4128             }
4129             else if (peekCharLL() == chDigit0) {
4130                 //  Octal Escape, using Java Regexp Conventions
4131                 //    which are \0 followed by 1-3 octal digits.
4132                 //    Different from ICU Unescape handling of Octal, which does not
4133                 //    require the leading 0.
4134                 //  Java also has the convention of only consuming 2 octal digits if
4135                 //    the three digit number would be > 0xff
4136                 //
4137                 c.fChar = 0;
4138                 nextCharLL();    // Consume the initial 0.
4139                 int index;
4140                 for (index=0; index<3; index++) {
4141                     int32_t ch = peekCharLL();
4142                     if (ch<chDigit0 || ch>chDigit7) {
4143                         if (index==0) {
4144                            // \0 is not followed by any octal digits.
4145                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4146                         }
4147                         break;
4148                     }
4149                     c.fChar <<= 3;
4150                     c.fChar += ch&7;
4151                     if (c.fChar <= 255) {
4152                         nextCharLL();
4153                     } else {
4154                         // The last digit made the number too big.  Forget we saw it.
4155                         c.fChar >>= 3;
4156                     }
4157                 }
4158                 c.fQuoted = TRUE;
4159             }
4160             else if (peekCharLL() == chQ) {
4161                 //  "\Q"  enter quote mode, which will continue until "\E"
4162                 fQuoteMode = TRUE;
4163                 nextCharLL();        // discard the 'Q'.
4164                 // nextChar(c);      // recurse to get the real next char.
4165                 goto tailRecursion;  // Note: fuzz testing produced test cases that
4166                 //                            resulted in stack overflow here.
4167             }
4168             else
4169             {
4170                 // We are in a '\' escape that will be handled by the state table scanner.
4171                 // Just return the backslash, but remember that the following char is to
4172                 //  be taken literally.
4173                 fInBackslashQuote = TRUE;
4174             }
4175         }
4176     }
4177 
4178     // re-enable # to end-of-line comments, in case they were disabled.
4179     // They are disabled by the parser upon seeing '(?', but this lasts for
4180     //  the fetching of the next character only.
4181     fEOLComments = TRUE;
4182 
4183     // putc(c.fChar, stdout);
4184 }
4185 
4186 
4187 
4188 //------------------------------------------------------------------------------
4189 //
4190 //  scanNamedChar
4191 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4192 //
4193 //             The scan position will be at the 'N'.  On return
4194 //             the scan position should be just after the '}'
4195 //
4196 //             Return the UChar32
4197 //
4198 //------------------------------------------------------------------------------
scanNamedChar()4199 UChar32  RegexCompile::scanNamedChar() {
4200     if (U_FAILURE(*fStatus)) {
4201         return 0;
4202     }
4203 
4204     nextChar(fC);
4205     if (fC.fChar != chLBrace) {
4206         error(U_REGEX_PROPERTY_SYNTAX);
4207         return 0;
4208     }
4209 
4210     UnicodeString  charName;
4211     for (;;) {
4212         nextChar(fC);
4213         if (fC.fChar == chRBrace) {
4214             break;
4215         }
4216         if (fC.fChar == -1) {
4217             error(U_REGEX_PROPERTY_SYNTAX);
4218             return 0;
4219         }
4220         charName.append(fC.fChar);
4221     }
4222 
4223     char name[100];
4224     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4225          (uint32_t)charName.length()>=sizeof(name)) {
4226         // All Unicode character names have only invariant characters.
4227         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4228         //   which requires this error check
4229         error(U_REGEX_PROPERTY_SYNTAX);
4230         return 0;
4231     }
4232     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4233 
4234     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4235     if (U_FAILURE(*fStatus)) {
4236         error(U_REGEX_PROPERTY_SYNTAX);
4237     }
4238 
4239     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4240     return theChar;
4241 }
4242 
4243 //------------------------------------------------------------------------------
4244 //
4245 //  scanProp   Construct a UnicodeSet from the text at the current scan
4246 //             position, which will be of the form \p{whaterver}
4247 //
4248 //             The scan position will be at the 'p' or 'P'.  On return
4249 //             the scan position should be just after the '}'
4250 //
4251 //             Return a UnicodeSet, constructed from the \P pattern,
4252 //             or NULL if the pattern is invalid.
4253 //
4254 //------------------------------------------------------------------------------
scanProp()4255 UnicodeSet *RegexCompile::scanProp() {
4256     UnicodeSet    *uset = NULL;
4257 
4258     if (U_FAILURE(*fStatus)) {
4259         return NULL;
4260     }
4261     (void)chLowerP;   // Suppress compiler unused variable warning.
4262     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4263     UBool negated = (fC.fChar == chP);
4264 
4265     UnicodeString propertyName;
4266     nextChar(fC);
4267     if (fC.fChar != chLBrace) {
4268         error(U_REGEX_PROPERTY_SYNTAX);
4269         return NULL;
4270     }
4271     for (;;) {
4272         nextChar(fC);
4273         if (fC.fChar == chRBrace) {
4274             break;
4275         }
4276         if (fC.fChar == -1) {
4277             // Hit the end of the input string without finding the closing '}'
4278             error(U_REGEX_PROPERTY_SYNTAX);
4279             return NULL;
4280         }
4281         propertyName.append(fC.fChar);
4282     }
4283     uset = createSetForProperty(propertyName, negated);
4284     nextChar(fC);    // Move input scan to position following the closing '}'
4285     return uset;
4286 }
4287 
4288 //------------------------------------------------------------------------------
4289 //
4290 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4291 //             position, which is expected be of the form [:property expression:]
4292 //
4293 //             The scan position will be at the opening ':'.  On return
4294 //             the scan position must be on the closing ']'
4295 //
4296 //             Return a UnicodeSet constructed from the pattern,
4297 //             or NULL if this is not a valid POSIX-style set expression.
4298 //             If not a property expression, restore the initial scan position
4299 //                (to the opening ':')
4300 //
4301 //               Note:  the opening '[:' is not sufficient to guarantee that
4302 //                      this is a [:property:] expression.
4303 //                      [:'+=,] is a perfectly good ordinary set expression that
4304 //                              happens to include ':' as one of its characters.
4305 //
4306 //------------------------------------------------------------------------------
scanPosixProp()4307 UnicodeSet *RegexCompile::scanPosixProp() {
4308     UnicodeSet    *uset = NULL;
4309 
4310     if (U_FAILURE(*fStatus)) {
4311         return NULL;
4312     }
4313 
4314     U_ASSERT(fC.fChar == chColon);
4315 
4316     // Save the scanner state.
4317     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4318     int64_t     savedScanIndex        = fScanIndex;
4319     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4320     UBool       savedQuoteMode        = fQuoteMode;
4321     UBool       savedInBackslashQuote = fInBackslashQuote;
4322     UBool       savedEOLComments      = fEOLComments;
4323     int64_t     savedLineNum          = fLineNum;
4324     int64_t     savedCharNum          = fCharNum;
4325     UChar32     savedLastChar         = fLastChar;
4326     UChar32     savedPeekChar         = fPeekChar;
4327     RegexPatternChar savedfC          = fC;
4328 
4329     // Scan for a closing ].   A little tricky because there are some perverse
4330     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4331     //   ending on the second closing ].
4332 
4333     UnicodeString propName;
4334     UBool         negated  = FALSE;
4335 
4336     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4337     nextChar(fC);
4338     if (fC.fChar == chUp) {
4339        negated = TRUE;
4340        nextChar(fC);
4341     }
4342 
4343     // Scan for the closing ":]", collecting the property name along the way.
4344     UBool  sawPropSetTerminator = FALSE;
4345     for (;;) {
4346         propName.append(fC.fChar);
4347         nextChar(fC);
4348         if (fC.fQuoted || fC.fChar == -1) {
4349             // Escaped characters or end of input - either says this isn't a [:Property:]
4350             break;
4351         }
4352         if (fC.fChar == chColon) {
4353             nextChar(fC);
4354             if (fC.fChar == chRBracket) {
4355                 sawPropSetTerminator = TRUE;
4356             }
4357             break;
4358         }
4359     }
4360 
4361     if (sawPropSetTerminator) {
4362         uset = createSetForProperty(propName, negated);
4363     }
4364     else
4365     {
4366         // No closing ":]".
4367         //  Restore the original scan position.
4368         //  The main scanner will retry the input as a normal set expression,
4369         //    not a [:Property:] expression.
4370         fScanIndex        = savedScanIndex;
4371         fQuoteMode        = savedQuoteMode;
4372         fInBackslashQuote = savedInBackslashQuote;
4373         fEOLComments      = savedEOLComments;
4374         fLineNum          = savedLineNum;
4375         fCharNum          = savedCharNum;
4376         fLastChar         = savedLastChar;
4377         fPeekChar         = savedPeekChar;
4378         fC                = savedfC;
4379         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4380     }
4381     return uset;
4382 }
4383 
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4384 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4385     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4386     addCategory(set, U_GC_CF_MASK, ec);
4387 }
4388 
4389 //
4390 //  Create a Unicode Set from a Unicode Property expression.
4391 //     This is common code underlying both \p{...} and [:...:] expressions.
4392 //     Includes trying the Java "properties" that aren't supported as
4393 //     normal ICU UnicodeSet properties
4394 //
createSetForProperty(const UnicodeString & propName,UBool negated)4395 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4396 
4397     if (U_FAILURE(*fStatus)) {
4398         return nullptr;
4399     }
4400     LocalPointer<UnicodeSet> set;
4401     UErrorCode status = U_ZERO_ERROR;
4402 
4403     do {      // non-loop, exists to allow breaks from the block.
4404         //
4405         //  First try the property as we received it
4406         //
4407         UnicodeString   setExpr;
4408         uint32_t        usetFlags = 0;
4409         setExpr.append(u"[\\p{", -1);
4410         setExpr.append(propName);
4411         setExpr.append(u"}]", -1);
4412         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4413             usetFlags |= USET_CASE_INSENSITIVE;
4414         }
4415         set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
4416         if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4417             break;
4418         }
4419 
4420         //
4421         //  The incoming property wasn't directly recognized by ICU.
4422 
4423         //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4424         //     Java accepts 'word' with mixed case.
4425         //     Java accepts 'all' only in all lower case.
4426 
4427         status = U_ZERO_ERROR;
4428         if (propName.caseCompare(u"word", -1, 0) == 0) {
4429             set.adoptInsteadAndCheckErrorCode(
4430                 RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4431             break;
4432         }
4433         if (propName.compare(u"all", -1) == 0) {
4434             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4435             break;
4436         }
4437 
4438 
4439         //    Do Java InBlock expressions
4440         //
4441         UnicodeString mPropName = propName;
4442         if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4443             status = U_ZERO_ERROR;
4444             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4445             if (U_FAILURE(status)) {
4446                 break;
4447             }
4448             UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4449             set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4450             break;
4451         }
4452 
4453         //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4454         //  as "BooleanPropertyValue". The property value can be either a
4455         //  a General Category or a Script Name.
4456 
4457         if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4458             mPropName.remove(0, 2);      // Strip the "Is"
4459             if (mPropName.indexOf(u'=') >= 0) {
4460                 // Reject any "Is..." property expression containing an '=', that is,
4461                 // any non-binary property expression.
4462                 status = U_REGEX_PROPERTY_SYNTAX;
4463                 break;
4464             }
4465 
4466             if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4467                 mPropName.setTo(u"unassigned", -1);
4468                 negated = !negated;
4469             } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4470                 mPropName.setTo(u"Titlecase_Letter", -1);
4471             }
4472 
4473             mPropName.insert(0, u"[\\p{", -1);
4474             mPropName.append(u"}]", -1);
4475             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4476 
4477             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4478                 set->closeOver(USET_CASE_INSENSITIVE);
4479             }
4480             break;
4481 
4482         }
4483 
4484         if (propName.startsWith(u"java", -1)) {
4485             status = U_ZERO_ERROR;
4486             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4487             if (U_FAILURE(status)) {
4488                 break;
4489             }
4490             //
4491             //  Try the various Java specific properties.
4492             //   These all begin with "java"
4493             //
4494             if (propName.compare(u"javaDefined", -1) == 0) {
4495                 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4496                 set->complement();
4497             }
4498             else if (propName.compare(u"javaDigit", -1) == 0) {
4499                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4500             }
4501             else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4502                 addIdentifierIgnorable(set.getAlias(), status);
4503             }
4504             else if (propName.compare(u"javaISOControl", -1) == 0) {
4505                 set->add(0, 0x1F).add(0x7F, 0x9F);
4506             }
4507             else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4508                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4509                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4510                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4511                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4512                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4513                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4514                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4515                 addIdentifierIgnorable(set.getAlias(), status);
4516             }
4517             else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4518                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4519                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4520                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4521                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4522             }
4523             else if (propName.compare(u"javaLetter", -1) == 0) {
4524                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4525             }
4526             else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4527                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4528                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4529             }
4530             else if (propName.compare(u"javaLowerCase", -1) == 0) {
4531                 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4532             }
4533             else if (propName.compare(u"javaMirrored", -1) == 0) {
4534                 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4535             }
4536             else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4537                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4538             }
4539             else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4540                 set->add(0x10000, UnicodeSet::MAX_VALUE);
4541             }
4542             else if (propName.compare(u"javaTitleCase", -1) == 0) {
4543                 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4544             }
4545             else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4546                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4547                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4548             }
4549             else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4550                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4551                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4552                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4553                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4554                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4555                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4556                 addIdentifierIgnorable(set.getAlias(), status);
4557             }
4558             else if (propName.compare(u"javaUpperCase", -1) == 0) {
4559                 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4560             }
4561             else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4562                 set->add(0, UnicodeSet::MAX_VALUE);
4563             }
4564             else if (propName.compare(u"javaWhitespace", -1) == 0) {
4565                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4566                 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4567                 set->add(9, 0x0d).add(0x1c, 0x1f);
4568             } else {
4569                 status = U_REGEX_PROPERTY_SYNTAX;
4570             }
4571 
4572             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4573                 set->closeOver(USET_CASE_INSENSITIVE);
4574             }
4575             break;
4576         }
4577 
4578         // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4579         // extensions matched it.
4580         status = U_REGEX_PROPERTY_SYNTAX;
4581     } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4582 
4583     if (U_SUCCESS(status)) {
4584         // ICU 70 adds emoji properties of strings, but as long as Java does not say how to
4585         // deal with properties of strings and character classes with strings, we ignore them.
4586         // Just in case something downstream might stumble over the strings,
4587         // we remove them from the set.
4588         // Note that when we support strings, the complement of a property (as with \P)
4589         // should be implemented as .complement().removeAllStrings() (code point complement).
4590         set->removeAllStrings();
4591         U_ASSERT(set.isValid());
4592         if (negated) {
4593             set->complement();
4594         }
4595         return set.orphan();
4596     } else {
4597         if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4598             status = U_REGEX_PROPERTY_SYNTAX;
4599         }
4600         error(status);
4601         return nullptr;
4602     }
4603 }
4604 
4605 
4606 //
4607 //  SetEval   Part of the evaluation of [set expressions].
4608 //            Perform any pending (stacked) operations with precedence
4609 //            equal or greater to that of the next operator encountered
4610 //            in the expression.
4611 //
setEval(int32_t nextOp)4612 void RegexCompile::setEval(int32_t nextOp) {
4613     UnicodeSet *rightOperand = NULL;
4614     UnicodeSet *leftOperand  = NULL;
4615     for (;;) {
4616         U_ASSERT(fSetOpStack.empty()==FALSE);
4617         int32_t pendingSetOperation = fSetOpStack.peeki();
4618         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4619             break;
4620         }
4621         fSetOpStack.popi();
4622         U_ASSERT(fSetStack.empty() == FALSE);
4623         rightOperand = (UnicodeSet *)fSetStack.peek();
4624         // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
4625         // (see comments there).
4626         // We also do not yet support string literals in character classes,
4627         // so there should not be any strings.
4628         // Note that when we support strings, the complement of a set (as with ^ or \P)
4629         // should be implemented as .complement().removeAllStrings() (code point complement).
4630         U_ASSERT(!rightOperand->hasStrings());
4631         switch (pendingSetOperation) {
4632             case setNegation:
4633                 rightOperand->complement();
4634                 break;
4635             case setCaseClose:
4636                 // TODO: need a simple close function.  Ticket 6065
4637                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4638                 rightOperand->removeAllStrings();
4639                 break;
4640             case setDifference1:
4641             case setDifference2:
4642                 fSetStack.pop();
4643                 leftOperand = (UnicodeSet *)fSetStack.peek();
4644                 leftOperand->removeAll(*rightOperand);
4645                 delete rightOperand;
4646                 break;
4647             case setIntersection1:
4648             case setIntersection2:
4649                 fSetStack.pop();
4650                 leftOperand = (UnicodeSet *)fSetStack.peek();
4651                 leftOperand->retainAll(*rightOperand);
4652                 delete rightOperand;
4653                 break;
4654             case setUnion:
4655                 fSetStack.pop();
4656                 leftOperand = (UnicodeSet *)fSetStack.peek();
4657                 leftOperand->addAll(*rightOperand);
4658                 delete rightOperand;
4659                 break;
4660             default:
4661                 UPRV_UNREACHABLE_EXIT;
4662             }
4663         }
4664     }
4665 
setPushOp(int32_t op)4666 void RegexCompile::setPushOp(int32_t op) {
4667     setEval(op);
4668     fSetOpStack.push(op, *fStatus);
4669     LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
4670     fSetStack.push(lpSet.orphan(), *fStatus);
4671 }
4672 
4673 U_NAMESPACE_END
4674 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4675 
4676