• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  regexcmp.cpp
5 //
6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the ICU regular expression compiler, which is responsible
10 //  for processing a regular expression pattern into the compiled form that
11 //  is used by the match finding engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstr.h"
32 #include "cstring.h"
33 #include "uvectr32.h"
34 #include "uvectr64.h"
35 #include "uassert.h"
36 #include "uinvchar.h"
37 
38 #include "regeximp.h"
39 #include "regexcst.h"   // Contains state table for the regex pattern parser.
40                         //   generated by a Perl script.
41 #include "regexcmp.h"
42 #include "regexst.h"
43 #include "regextxt.h"
44 
45 
46 
47 U_NAMESPACE_BEGIN
48 
49 
50 //------------------------------------------------------------------------------
51 //
52 //  Constructor.
53 //
54 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56    fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status)
57 {
58     // Lazy init of all shared global sets (needed for init()'s empty text)
59     RegexStaticSets::initGlobals(&status);
60 
61     fStatus           = &status;
62 
63     fRXPat            = rxp;
64     fScanIndex        = 0;
65     fLastChar         = -1;
66     fPeekChar         = -1;
67     fLineNum          = 1;
68     fCharNum          = 0;
69     fQuoteMode        = false;
70     fInBackslashQuote = false;
71     fModeFlags        = fRXPat->fFlags | 0x80000000;
72     fEOLComments      = true;
73 
74     fMatchOpenParen   = -1;
75     fMatchCloseParen  = -1;
76     fCaptureName      = nullptr;
77     fLastSetLiteral   = U_SENTINEL;
78 
79     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80         status = rxp->fDeferredStatus;
81     }
82 }
83 
84 static const char16_t   chAmp       = 0x26;      // '&'
85 static const char16_t   chDash      = 0x2d;      // '-'
86 
87 
88 //------------------------------------------------------------------------------
89 //
90 //  Destructor
91 //
92 //------------------------------------------------------------------------------
~RegexCompile()93 RegexCompile::~RegexCompile() {
94     delete fCaptureName;         // Normally will be nullptr, but can exist if pattern
95                                  //   compilation stops with a syntax error.
96 }
97 
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100 }
101 
102 //------------------------------------------------------------------------------
103 //
104 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105 //                           The state tables are hand-written in the file regexcst.txt,
106 //                           and converted to the form used here by a perl
107 //                           script regexcst.pl
108 //
109 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)110 void    RegexCompile::compile(
111                          const UnicodeString &pat,   // Source pat to be compiled.
112                          UParseError &pp,            // Error position info
113                          UErrorCode &e)              // Error Code
114 {
115     fRXPat->fPatternString = new UnicodeString(pat);
116     UText patternText = UTEXT_INITIALIZER;
117     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118 
119     if (U_SUCCESS(e)) {
120         compile(&patternText, pp, e);
121         utext_close(&patternText);
122     }
123 }
124 
125 //
126 //   compile, UText mode
127 //     All the work is actually done here.
128 //
compile(UText * pat,UParseError & pp,UErrorCode & e)129 void    RegexCompile::compile(
130                          UText *pat,                 // Source pat to be compiled.
131                          UParseError &pp,            // Error position info
132                          UErrorCode &e)              // Error Code
133 {
134     fStatus             = &e;
135     fParseErr           = &pp;
136     fStackPtr           = 0;
137     fStack[fStackPtr]   = 0;
138 
139     if (U_FAILURE(*fStatus)) {
140         return;
141     }
142 
143     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144     U_ASSERT(fRXPat->fPattern == nullptr || utext_nativeLength(fRXPat->fPattern) == 0);
145 
146     // Prepare the RegexPattern object to receive the compiled pattern.
147     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, false, true, fStatus);
148     if (U_FAILURE(*fStatus)) {
149         return;
150     }
151 
152     // Initialize the pattern scanning state machine
153     fPatternLength = utext_nativeLength(pat);
154     uint16_t                state = 1;
155     const RegexTableEl      *tableEl;
156 
157     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158     if (fModeFlags & UREGEX_LITERAL) {
159         fQuoteMode = true;
160     }
161 
162     nextChar(fC);                        // Fetch the first char from the pattern string.
163 
164     //
165     // Main loop for the regex pattern parsing state machine.
166     //   Runs once per state transition.
167     //   Each time through optionally performs, depending on the state table,
168     //      - an advance to the the next pattern char
169     //      - an action to be performed.
170     //      - pushing or popping a state to/from the local state return stack.
171     //   file regexcst.txt is the source for the state table.  The logic behind
172     //     recongizing the pattern syntax is there, not here.
173     //
174     for (;;) {
175         //  Bail out if anything has gone wrong.
176         //  Regex pattern parsing stops on the first error encountered.
177         if (U_FAILURE(*fStatus)) {
178             break;
179         }
180 
181         U_ASSERT(state != 0);
182 
183         // Find the state table element that matches the input char from the pattern, or the
184         //    class of the input character.  Start with the first table row for this
185         //    state, then linearly scan forward until we find a row that matches the
186         //    character.  The last row for each state always matches all characters, so
187         //    the search will stop there, if not before.
188         //
189         tableEl = &gRuleParseStateTable[state];
190         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
191             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192 
193         for (;;) {    // loop through table rows belonging to this state, looking for one
194                       //   that matches the current input char.
195             REGEX_SCAN_DEBUG_PRINTF(("."));
196             if (tableEl->fCharClass < 127 && fC.fQuoted == false &&   tableEl->fCharClass == fC.fChar) {
197                 // Table row specified an individual character, not a set, and
198                 //   the input character is not quoted, and
199                 //   the input character matched it.
200                 break;
201             }
202             if (tableEl->fCharClass == 255) {
203                 // Table row specified default, match anything character class.
204                 break;
205             }
206             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
207                 // Table row specified "quoted" and the char was quoted.
208                 break;
209             }
210             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
211                 // Table row specified eof and we hit eof on the input.
212                 break;
213             }
214 
215             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
216                 fC.fQuoted == false &&                                       //   char is not escaped &&
217                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
218                 U_ASSERT(tableEl->fCharClass <= 137);
219                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220                     // Table row specified a character class, or set of characters,
221                     //   and the current char matches it.
222                     break;
223                 }
224             }
225 
226             // No match on this row, advance to the next  row for this state,
227             tableEl++;
228         }
229         REGEX_SCAN_DEBUG_PRINTF(("\n"));
230 
231         //
232         // We've found the row of the state table that matches the current input
233         //   character from the rules string.
234         // Perform any action specified  by this row in the state table.
235         if (doParseActions(tableEl->fAction) == false) {
236             // Break out of the state machine loop if the
237             //   the action signalled some kind of error, or
238             //   the action was to exit, occurs on normal end-of-rules-input.
239             break;
240         }
241 
242         if (tableEl->fPushState != 0) {
243             fStackPtr++;
244             if (fStackPtr >= kStackSize) {
245                 error(U_REGEX_INTERNAL_ERROR);
246                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247                 fStackPtr--;
248             }
249             fStack[fStackPtr] = tableEl->fPushState;
250         }
251 
252         //
253         //  NextChar.  This is where characters are actually fetched from the pattern.
254         //             Happens under control of the 'n' tag in the state table.
255         //
256         if (tableEl->fNextChar) {
257             nextChar(fC);
258         }
259 
260         // Get the next state from the table entry, or from the
261         //   state stack if the next state was specified as "pop".
262         if (tableEl->fNextState != 255) {
263             state = tableEl->fNextState;
264         } else {
265             state = fStack[fStackPtr];
266             fStackPtr--;
267             if (fStackPtr < 0) {
268                 // state stack underflow
269                 // This will occur if the user pattern has mis-matched parentheses,
270                 //   with extra close parens.
271                 //
272                 fStackPtr++;
273                 error(U_REGEX_MISMATCHED_PAREN);
274             }
275         }
276 
277     }
278 
279     if (U_FAILURE(*fStatus)) {
280         // Bail out if the pattern had errors.
281         return;
282     }
283 
284     //
285     // The pattern has now been read and processed, and the compiled code generated.
286     //
287 
288     //
289     // The pattern's fFrameSize so far has accumulated the requirements for
290     //   storage for capture parentheses, counters, etc. that are encountered
291     //   in the pattern.  Add space for the two variables that are always
292     //   present in the saved state:  the input string position (int64_t) and
293     //   the position in the compiled pattern.
294     //
295     allocateStackData(RESTACKFRAME_HDRCOUNT);
296 
297     //
298     // Optimization pass 1: NOPs, back-references, and case-folding
299     //
300     stripNOPs();
301 
302     //
303     // Get bounds for the minimum and maximum length of a string that this
304     //   pattern can match.  Used to avoid looking for matches in strings that
305     //   are too short.
306     //
307     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
308 
309     //
310     // Optimization pass 2: match start type
311     //
312     matchStartType();
313 
314     //
315     // Set up fast latin-1 range sets
316     //
317     int32_t numSets = fRXPat->fSets->size();
318     fRXPat->fSets8 = new Regex8BitSet[numSets];
319     // Null pointer check.
320     if (fRXPat->fSets8 == nullptr) {
321         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
322         return;
323     }
324     int32_t i;
325     for (i=0; i<numSets; i++) {
326         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
327         fRXPat->fSets8[i].init(s);
328     }
329 
330 }
331 
332 
333 
334 
335 
336 //------------------------------------------------------------------------------
337 //
338 //  doParseAction        Do some action during regex pattern parsing.
339 //                       Called by the parse state machine.
340 //
341 //                       Generation of the match engine PCode happens here, or
342 //                       in functions called from the parse actions defined here.
343 //
344 //
345 //------------------------------------------------------------------------------
doParseActions(int32_t action)346 UBool RegexCompile::doParseActions(int32_t action)
347 {
348     UBool   returnVal = true;
349 
350     switch ((Regex_PatternParseAction)action) {
351 
352     case doPatStart:
353         // Start of pattern compiles to:
354         //0   SAVE   2        Fall back to position of FAIL
355         //1   jmp    3
356         //2   FAIL            Stop if we ever reach here.
357         //3   NOP             Dummy, so start of pattern looks the same as
358         //                    the start of an ( grouping.
359         //4   NOP             Resreved, will be replaced by a save if there are
360         //                    OR | operators at the top level
361         appendOp(URX_STATE_SAVE, 2);
362         appendOp(URX_JMP,  3);
363         appendOp(URX_FAIL, 0);
364 
365         // Standard open nonCapture paren action emits the two NOPs and
366         //   sets up the paren stack frame.
367         doParseActions(doOpenNonCaptureParen);
368         break;
369 
370     case doPatFinish:
371         // We've scanned to the end of the pattern
372         //  The end of pattern compiles to:
373         //        URX_END
374         //    which will stop the runtime match engine.
375         //  Encountering end of pattern also behaves like a close paren,
376         //   and forces fixups of the State Save at the beginning of the compiled pattern
377         //   and of any OR operations at the top level.
378         //
379         handleCloseParen();
380         if (fParenStack.size() > 0) {
381             // Missing close paren in pattern.
382             error(U_REGEX_MISMATCHED_PAREN);
383         }
384 
385         // add the END operation to the compiled pattern.
386         appendOp(URX_END, 0);
387 
388         // Terminate the pattern compilation state machine.
389         returnVal = false;
390         break;
391 
392 
393 
394     case doOrOperator:
395         // Scanning a '|', as in (A|B)
396         {
397             // Generate code for any pending literals preceding the '|'
398             fixLiterals(false);
399 
400             // Insert a SAVE operation at the start of the pattern section preceding
401             //   this OR at this level.  This SAVE will branch the match forward
402             //   to the right hand side of the OR in the event that the left hand
403             //   side fails to match and backtracks.  Locate the position for the
404             //   save from the location on the top of the parentheses stack.
405             int32_t savePosition = fParenStack.popi();
406             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
407             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
408             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
409             fRXPat->fCompiledPat->setElementAt(op, savePosition);
410 
411             // Append an JMP operation into the compiled pattern.  The operand for
412             //  the JMP will eventually be the location following the ')' for the
413             //  group.  This will be patched in later, when the ')' is encountered.
414             appendOp(URX_JMP, 0);
415 
416             // Push the position of the newly added JMP op onto the parentheses stack.
417             // This registers if for fixup when this block's close paren is encountered.
418             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
419 
420             // Append a NOP to the compiled pattern.  This is the slot reserved
421             //   for a SAVE in the event that there is yet another '|' following
422             //   this one.
423             appendOp(URX_NOP, 0);
424             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
425         }
426         break;
427 
428 
429     case doBeginNamedCapture:
430         // Scanning (?<letter.
431         //   The first letter of the name will come through again under doConinueNamedCapture.
432         fCaptureName = new UnicodeString();
433         if (fCaptureName == nullptr) {
434             error(U_MEMORY_ALLOCATION_ERROR);
435         }
436         break;
437 
438     case  doContinueNamedCapture:
439         fCaptureName->append(fC.fChar);
440         break;
441 
442     case doBadNamedCapture:
443         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
444         break;
445 
446     case doOpenCaptureParen:
447         // Open Capturing Paren, possibly named.
448         //   Compile to a
449         //      - NOP, which later may be replaced by a save-state if the
450         //         parenthesized group gets a * quantifier, followed by
451         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
452         //      - NOP, which may later be replaced by a save-state if there
453         //             is an '|' alternation within the parens.
454         //
455         //    Each capture group gets three slots in the save stack frame:
456         //         0: Capture Group start position (in input string being matched.)
457         //         1: Capture Group end position.
458         //         2: Start of Match-in-progress.
459         //    The first two locations are for a completed capture group, and are
460         //     referred to by back references and the like.
461         //    The third location stores the capture start position when an START_CAPTURE is
462         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
463         //      END_CAPTURE is encountered.
464         {
465             fixLiterals();
466             appendOp(URX_NOP, 0);
467             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
468             appendOp(URX_START_CAPTURE, varsLoc);
469             appendOp(URX_NOP, 0);
470 
471             // On the Parentheses stack, start a new frame and add the positions
472             //   of the two NOPs.  Depending on what follows in the pattern, the
473             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
474             //   address of the end of the parenthesized group.
475             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
476             fParenStack.push(capturing, *fStatus);                        // Frame type.
477             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
478             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
479 
480             // Save the mapping from group number to stack frame variable position.
481             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
482 
483             // If this is a named capture group, add the name->group number mapping.
484             if (fCaptureName != nullptr) {
485                 if (!fRXPat->initNamedCaptureMap()) {
486                     if (U_SUCCESS(*fStatus)) {
487                         error(fRXPat->fDeferredStatus);
488                     }
489                     break;
490                 }
491                 int32_t groupNumber = fRXPat->fGroupMap->size();
492                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
493                 fCaptureName = nullptr;    // hash table takes ownership of the name (key) string.
494                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
495                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
496                 }
497             }
498         }
499         break;
500 
501     case doOpenNonCaptureParen:
502         // Open non-caputuring (grouping only) Paren.
503         //   Compile to a
504         //      - NOP, which later may be replaced by a save-state if the
505         //         parenthesized group gets a * quantifier, followed by
506         //      - NOP, which may later be replaced by a save-state if there
507         //             is an '|' alternation within the parens.
508         {
509             fixLiterals();
510             appendOp(URX_NOP, 0);
511             appendOp(URX_NOP, 0);
512 
513             // On the Parentheses stack, start a new frame and add the positions
514             //   of the two NOPs.
515             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
516             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
517             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
518             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
519         }
520          break;
521 
522 
523     case doOpenAtomicParen:
524         // Open Atomic Paren.  (?>
525         //   Compile to a
526         //      - NOP, which later may be replaced if the parenthesized group
527         //         has a quantifier, followed by
528         //      - STO_SP  save state stack position, so it can be restored at the ")"
529         //      - NOP, which may later be replaced by a save-state if there
530         //             is an '|' alternation within the parens.
531         {
532             fixLiterals();
533             appendOp(URX_NOP, 0);
534             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
535             appendOp(URX_STO_SP, varLoc);
536             appendOp(URX_NOP, 0);
537 
538             // On the Parentheses stack, start a new frame and add the positions
539             //   of the two NOPs.  Depending on what follows in the pattern, the
540             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
541             //   address of the end of the parenthesized group.
542             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
543             fParenStack.push(atomic, *fStatus);                           // Frame type.
544             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
545             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
546         }
547         break;
548 
549 
550     case doOpenLookAhead:
551         // Positive Look-ahead   (?=  stuff  )
552         //
553         //   Note:   Addition of transparent input regions, with the need to
554         //           restore the original regions when failing out of a lookahead
555         //           block, complicated this sequence.  Some combined opcodes
556         //           might make sense - or might not, lookahead aren't that common.
557         //
558         //      Caution:  min match length optimization knows about this
559         //               sequence; don't change without making updates there too.
560         //
561         // Compiles to
562         //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
563         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
564         //    3    JMP          6           continue ...
565         //
566         //    4.   LA_END                   Look Ahead failed.  Restore regions.
567         //    5.   BACKTRACK                and back track again.
568         //
569         //    6.   NOP              reserved for use by quantifiers on the block.
570         //                          Look-ahead can't have quantifiers, but paren stack
571         //                             compile time conventions require the slot anyhow.
572         //    7.   NOP              may be replaced if there is are '|' ops in the block.
573         //    8.     code for parenthesized stuff.
574         //    9.   LA_END
575         //
576         //  Four data slots are reserved, for saving state on entry to the look-around
577         //    0:   stack pointer on entry.
578         //    1:   input position on entry.
579         //    2:   fActiveStart, the active bounds start on entry.
580         //    3:   fActiveLimit, the active bounds limit on entry.
581         {
582             fixLiterals();
583             int32_t dataLoc = allocateData(4);
584             appendOp(URX_LA_START, dataLoc);
585             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
586             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
587             appendOp(URX_LA_END, dataLoc);
588             appendOp(URX_BACKTRACK, 0);
589             appendOp(URX_NOP, 0);
590             appendOp(URX_NOP, 0);
591 
592             // On the Parentheses stack, start a new frame and add the positions
593             //   of the NOPs.
594             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
595             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
596             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
597             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
598         }
599         break;
600 
601     case doOpenLookAheadNeg:
602         // Negated Lookahead.   (?! stuff )
603         // Compiles to
604         //    1.    LA_START    dataloc
605         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
606         //                                //   which continues with the match.
607         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
608         //    4.       code for parenthesized stuff.
609         //    5.    LA_END                // Cut back stack, remove saved state from step 2.
610         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
611         //    7.    END_LA                // Restore match region, in case look-ahead was using
612         //                                        an alternate (transparent) region.
613         //  Four data slots are reserved, for saving state on entry to the look-around
614         //    0:   stack pointer on entry.
615         //    1:   input position on entry.
616         //    2:   fActiveStart, the active bounds start on entry.
617         //    3:   fActiveLimit, the active bounds limit on entry.
618         {
619             fixLiterals();
620             int32_t dataLoc = allocateData(4);
621             appendOp(URX_LA_START, dataLoc);
622             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
623             appendOp(URX_NOP, 0);
624 
625             // On the Parentheses stack, start a new frame and add the positions
626             //   of the StateSave and NOP.
627             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
628             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
629             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
630             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
631 
632             // Instructions #5 - #7 will be added when the ')' is encountered.
633         }
634         break;
635 
636     case doOpenLookBehind:
637         {
638             //   Compile a (?<= look-behind open paren.
639             //
640             //          Compiles to
641             //              0       URX_LB_START     dataLoc
642             //              1       URX_LB_CONT      dataLoc
643             //              2                        MinMatchLen
644             //              3                        MaxMatchLen
645             //              4       URX_NOP          Standard '(' boilerplate.
646             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
647             //              6         <code for LookBehind expression>
648             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
649             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
650             //
651             //          Allocate a block of matcher data, to contain (when running a match)
652             //              0:    Stack ptr on entry
653             //              1:    Input Index on entry
654             //              2:    fActiveStart, the active bounds start on entry.
655             //              3:    fActiveLimit, the active bounds limit on entry.
656             //              4:    Start index of match current match attempt.
657             //          The first four items must match the layout of data for LA_START / LA_END
658 
659             // Generate match code for any pending literals.
660             fixLiterals();
661 
662             // Allocate data space
663             int32_t dataLoc = allocateData(5);
664 
665             // Emit URX_LB_START
666             appendOp(URX_LB_START, dataLoc);
667 
668             // Emit URX_LB_CONT
669             appendOp(URX_LB_CONT, dataLoc);
670             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
671             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
672 
673             // Emit the NOPs
674             appendOp(URX_NOP, 0);
675             appendOp(URX_NOP, 0);
676 
677             // On the Parentheses stack, start a new frame and add the positions
678             //   of the URX_LB_CONT and the NOP.
679             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
680             fParenStack.push(lookBehind, *fStatus);                       // Frame type
681             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
682             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
683 
684             // The final two instructions will be added when the ')' is encountered.
685         }
686 
687         break;
688 
689     case doOpenLookBehindNeg:
690         {
691             //   Compile a (?<! negated look-behind open paren.
692             //
693             //          Compiles to
694             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
695             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
696             //              2                        MinMatchLen
697             //              3                        MaxMatchLen
698             //              4                        continueLoc (9)
699             //              5       URX_NOP          Standard '(' boilerplate.
700             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
701             //              7         <code for LookBehind expression>
702             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
703             //              9       ...
704             //
705             //          Allocate a block of matcher data, to contain (when running a match)
706             //              0:    Stack ptr on entry
707             //              1:    Input Index on entry
708             //              2:    fActiveStart, the active bounds start on entry.
709             //              3:    fActiveLimit, the active bounds limit on entry.
710             //              4:    Start index of match current match attempt.
711             //          The first four items must match the layout of data for LA_START / LA_END
712 
713             // Generate match code for any pending literals.
714             fixLiterals();
715 
716             // Allocate data space
717             int32_t dataLoc = allocateData(5);
718 
719             // Emit URX_LB_START
720             appendOp(URX_LB_START, dataLoc);
721 
722             // Emit URX_LBN_CONT
723             appendOp(URX_LBN_CONT, dataLoc);
724             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
725             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
726             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
727 
728             // Emit the NOPs
729             appendOp(URX_NOP, 0);
730             appendOp(URX_NOP, 0);
731 
732             // On the Parentheses stack, start a new frame and add the positions
733             //   of the URX_LB_CONT and the NOP.
734             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
735             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
736             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
737             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
738 
739             // The final two instructions will be added when the ')' is encountered.
740         }
741         break;
742 
743     case doConditionalExpr:
744         // Conditionals such as (?(1)a:b)
745     case doPerlInline:
746         // Perl inline-conditionals.  (?{perl code}a|b) We're not perl, no way to do them.
747         error(U_REGEX_UNIMPLEMENTED);
748         break;
749 
750 
751     case doCloseParen:
752         handleCloseParen();
753         if (fParenStack.size() <= 0) {
754             //  Extra close paren, or missing open paren.
755             error(U_REGEX_MISMATCHED_PAREN);
756         }
757         break;
758 
759     case doNOP:
760         break;
761 
762 
763     case doBadOpenParenType:
764     case doRuleError:
765         error(U_REGEX_RULE_SYNTAX);
766         break;
767 
768 
769     case doMismatchedParenErr:
770         error(U_REGEX_MISMATCHED_PAREN);
771         break;
772 
773     case doPlus:
774         //  Normal '+'  compiles to
775         //     1.   stuff to be repeated  (already built)
776         //     2.   jmp-sav 1
777         //     3.   ...
778         //
779         //  Or, if the item to be repeated can match a zero length string,
780         //     1.   STO_INP_LOC  data-loc
781         //     2.      body of stuff to be repeated
782         //     3.   JMP_SAV_X    2
783         //     4.   ...
784 
785         //
786         //  Or, if the item to be repeated is simple
787         //     1.   Item to be repeated.
788         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
789         //     3.   LOOP_C       stack location
790         {
791             int32_t  topLoc = blockTopLoc(false);        // location of item #1
792             int32_t  frameLoc;
793 
794             // Check for simple constructs, which may get special optimized code.
795             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
796                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
797 
798                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
799                     // Emit optimized code for [char set]+
800                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
801                     frameLoc = allocateStackData(1);
802                     appendOp(URX_LOOP_C, frameLoc);
803                     break;
804                 }
805 
806                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
807                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
808                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
809                     // Emit Optimized code for .+ operations.
810                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
811                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
812                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
813                         loopOpI |= 1;
814                     }
815                     if (fModeFlags & UREGEX_UNIX_LINES) {
816                         loopOpI |= 2;
817                     }
818                     appendOp(loopOpI);
819                     frameLoc = allocateStackData(1);
820                     appendOp(URX_LOOP_C, frameLoc);
821                     break;
822                 }
823 
824             }
825 
826             // General case.
827 
828             // Check for minimum match length of zero, which requires
829             //    extra loop-breaking code.
830             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
831                 // Zero length match is possible.
832                 // Emit the code sequence that can handle it.
833                 insertOp(topLoc);
834                 frameLoc = allocateStackData(1);
835 
836                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
837                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
838 
839                 appendOp(URX_JMP_SAV_X, topLoc+1);
840             } else {
841                 // Simpler code when the repeated body must match something non-empty
842                 appendOp(URX_JMP_SAV, topLoc);
843             }
844         }
845         break;
846 
847     case doNGPlus:
848         //  Non-greedy '+?'  compiles to
849         //     1.   stuff to be repeated  (already built)
850         //     2.   state-save  1
851         //     3.   ...
852         {
853             int32_t topLoc      = blockTopLoc(false);
854             appendOp(URX_STATE_SAVE, topLoc);
855         }
856         break;
857 
858 
859     case doOpt:
860         // Normal (greedy) ? quantifier.
861         //  Compiles to
862         //     1. state save 3
863         //     2.    body of optional block
864         //     3. ...
865         // Insert the state save into the compiled pattern, and we're done.
866         {
867             int32_t   saveStateLoc = blockTopLoc(true);
868             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
869             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
870         }
871         break;
872 
873     case doNGOpt:
874         // Non-greedy ?? quantifier
875         //   compiles to
876         //    1.  jmp   4
877         //    2.     body of optional block
878         //    3   jmp   5
879         //    4.  state save 2
880         //    5    ...
881         //  This code is less than ideal, with two jmps instead of one, because we can only
882         //  insert one instruction at the top of the block being iterated.
883         {
884             int32_t  jmp1_loc = blockTopLoc(true);
885             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
886 
887             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
888             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
889 
890             appendOp(URX_JMP, jmp2_loc+2);
891 
892             appendOp(URX_STATE_SAVE, jmp1_loc+1);
893         }
894         break;
895 
896 
897     case doStar:
898         // Normal (greedy) * quantifier.
899         // Compiles to
900         //       1.   STATE_SAVE   4
901         //       2.      body of stuff being iterated over
902         //       3.   JMP_SAV      2
903         //       4.   ...
904         //
905         // Or, if the body is a simple [Set],
906         //       1.   LOOP_SR_I    set number
907         //       2.   LOOP_C       stack location
908         //       ...
909         //
910         // Or if this is a .*
911         //       1.   LOOP_DOT_I    (. matches all mode flag)
912         //       2.   LOOP_C        stack location
913         //
914         // Or, if the body can match a zero-length string, to inhibit infinite loops,
915         //       1.   STATE_SAVE   5
916         //       2.   STO_INP_LOC  data-loc
917         //       3.      body of stuff
918         //       4.   JMP_SAV_X    2
919         //       5.   ...
920         {
921             // location of item #1, the STATE_SAVE
922             int32_t   topLoc = blockTopLoc(false);
923             int32_t   dataLoc = -1;
924 
925             // Check for simple *, where the construct being repeated
926             //   compiled to single opcode, and might be optimizable.
927             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
928                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
929 
930                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
931                     // Emit optimized code for a [char set]*
932                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
933                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
934                     dataLoc = allocateStackData(1);
935                     appendOp(URX_LOOP_C, dataLoc);
936                     break;
937                 }
938 
939                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
940                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
941                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
942                     // Emit Optimized code for .* operations.
943                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
944                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
945                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
946                         loopOpI |= 1;
947                     }
948                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
949                         loopOpI |= 2;
950                     }
951                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
952                     dataLoc = allocateStackData(1);
953                     appendOp(URX_LOOP_C, dataLoc);
954                     break;
955                 }
956             }
957 
958             // Emit general case code for this *
959             // The optimizations did not apply.
960 
961             int32_t   saveStateLoc = blockTopLoc(true);
962             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
963 
964             // Check for minimum match length of zero, which requires
965             //    extra loop-breaking code.
966             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
967                 insertOp(saveStateLoc);
968                 dataLoc = allocateStackData(1);
969 
970                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
971                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
972                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
973             }
974 
975             // Locate the position in the compiled pattern where the match will continue
976             //   after completing the *.   (4 or 5 in the comment above)
977             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
978 
979             // Put together the save state op and store it into the compiled code.
980             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
981             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
982 
983             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
984             appendOp(jmpOp);
985         }
986         break;
987 
988     case doNGStar:
989         // Non-greedy *? quantifier
990         // compiles to
991         //     1.   JMP    3
992         //     2.      body of stuff being iterated over
993         //     3.   STATE_SAVE  2
994         //     4    ...
995         {
996             int32_t     jmpLoc  = blockTopLoc(true);                   // loc  1.
997             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
998             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
999             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000             appendOp(URX_STATE_SAVE, jmpLoc+1);
1001         }
1002         break;
1003 
1004 
1005     case doIntervalInit:
1006         // The '{' opening an interval quantifier was just scanned.
1007         // Init the counter variables that will accumulate the values as the digits
1008         //    are scanned.
1009         fIntervalLow = 0;
1010         fIntervalUpper = -1;
1011         break;
1012 
1013     case doIntevalLowerDigit:
1014         // Scanned a digit from the lower value of an {lower,upper} interval
1015         {
1016             int32_t digitValue = u_charDigitValue(fC.fChar);
1017             U_ASSERT(digitValue >= 0);
1018             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1019             if (val > INT32_MAX) {
1020                 error(U_REGEX_NUMBER_TOO_BIG);
1021             } else {
1022                 fIntervalLow = (int32_t)val;
1023             }
1024         }
1025         break;
1026 
1027     case doIntervalUpperDigit:
1028         // Scanned a digit from the upper value of an {lower,upper} interval
1029         {
1030             if (fIntervalUpper < 0) {
1031                 fIntervalUpper = 0;
1032             }
1033             int32_t digitValue = u_charDigitValue(fC.fChar);
1034             U_ASSERT(digitValue >= 0);
1035             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1036             if (val > INT32_MAX) {
1037                 error(U_REGEX_NUMBER_TOO_BIG);
1038             } else {
1039                 fIntervalUpper = (int32_t)val;
1040             }
1041         }
1042         break;
1043 
1044     case doIntervalSame:
1045         // Scanned a single value interval like {27}.  Upper = Lower.
1046         fIntervalUpper = fIntervalLow;
1047         break;
1048 
1049     case doInterval:
1050         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1051         if (compileInlineInterval() == false) {
1052             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053         }
1054         break;
1055 
1056     case doPossessiveInterval:
1057         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1058         {
1059             // Remember the loc for the top of the block being looped over.
1060             //   (Can not reserve a slot in the compiled pattern at this time, because
1061             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1062             //    once per block.)
1063             int32_t topLoc = blockTopLoc(false);
1064 
1065             // Produce normal looping code.
1066             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1067 
1068             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1069             //  just as if the loop was inclosed in atomic parentheses.
1070 
1071             // First the STO_SP before the start of the loop
1072             insertOp(topLoc);
1073 
1074             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1075             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1076             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1077 
1078             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1079             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1080             loopOp++;     // point LoopOp after the just-inserted STO_SP
1081             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1082 
1083             // Then the LD_SP after the end of the loop
1084             appendOp(URX_LD_SP, varLoc);
1085         }
1086 
1087         break;
1088 
1089     case doNGInterval:
1090         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1091         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1092         break;
1093 
1094     case doIntervalError:
1095         error(U_REGEX_BAD_INTERVAL);
1096         break;
1097 
1098     case doLiteralChar:
1099         // We've just scanned a "normal" character from the pattern,
1100         literalChar(fC.fChar);
1101         break;
1102 
1103 
1104     case doEscapedLiteralChar:
1105         // We've just scanned an backslashed escaped character with  no
1106         //   special meaning.  It represents itself.
1107         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1108             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1109             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1110                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1111              }
1112         literalChar(fC.fChar);
1113         break;
1114 
1115 
1116     case doDotAny:
1117         // scanned a ".",  match any single character.
1118         {
1119             fixLiterals(false);
1120             if (fModeFlags & UREGEX_DOTALL) {
1121                 appendOp(URX_DOTANY_ALL, 0);
1122             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1123                 appendOp(URX_DOTANY_UNIX, 0);
1124             } else {
1125                 appendOp(URX_DOTANY, 0);
1126             }
1127         }
1128         break;
1129 
1130     case doCaret:
1131         {
1132             fixLiterals(false);
1133             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134                 appendOp(URX_CARET, 0);
1135             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136                 appendOp(URX_CARET_M, 0);
1137             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1139             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140                 appendOp(URX_CARET_M_UNIX, 0);
1141             }
1142         }
1143         break;
1144 
1145     case doDollar:
1146         {
1147             fixLiterals(false);
1148             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1149                 appendOp(URX_DOLLAR, 0);
1150             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151                 appendOp(URX_DOLLAR_M, 0);
1152             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1153                 appendOp(URX_DOLLAR_D, 0);
1154             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155                 appendOp(URX_DOLLAR_MD, 0);
1156             }
1157         }
1158         break;
1159 
1160     case doBackslashA:
1161         fixLiterals(false);
1162         appendOp(URX_CARET, 0);
1163         break;
1164 
1165     case doBackslashB:
1166         {
1167             #if  UCONFIG_NO_BREAK_ITERATION==1
1168             if (fModeFlags & UREGEX_UWORD) {
1169                 error(U_UNSUPPORTED_ERROR);
1170             }
1171             #endif
1172             fixLiterals(false);
1173             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1174             appendOp(op, 1);
1175         }
1176         break;
1177 
1178     case doBackslashb:
1179         {
1180             #if  UCONFIG_NO_BREAK_ITERATION==1
1181             if (fModeFlags & UREGEX_UWORD) {
1182                 error(U_UNSUPPORTED_ERROR);
1183             }
1184             #endif
1185             fixLiterals(false);
1186             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1187             appendOp(op, 0);
1188         }
1189         break;
1190 
1191     case doBackslashD:
1192         fixLiterals(false);
1193         appendOp(URX_BACKSLASH_D, 1);
1194         break;
1195 
1196     case doBackslashd:
1197         fixLiterals(false);
1198         appendOp(URX_BACKSLASH_D, 0);
1199         break;
1200 
1201     case doBackslashG:
1202         fixLiterals(false);
1203         appendOp(URX_BACKSLASH_G, 0);
1204         break;
1205 
1206     case doBackslashH:
1207         fixLiterals(false);
1208         appendOp(URX_BACKSLASH_H, 1);
1209         break;
1210 
1211     case doBackslashh:
1212         fixLiterals(false);
1213         appendOp(URX_BACKSLASH_H, 0);
1214         break;
1215 
1216     case doBackslashR:
1217         fixLiterals(false);
1218         appendOp(URX_BACKSLASH_R, 0);
1219         break;
1220 
1221     case doBackslashS:
1222         fixLiterals(false);
1223         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1224         break;
1225 
1226     case doBackslashs:
1227         fixLiterals(false);
1228         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1229         break;
1230 
1231     case doBackslashV:
1232         fixLiterals(false);
1233         appendOp(URX_BACKSLASH_V, 1);
1234         break;
1235 
1236     case doBackslashv:
1237         fixLiterals(false);
1238         appendOp(URX_BACKSLASH_V, 0);
1239         break;
1240 
1241     case doBackslashW:
1242         fixLiterals(false);
1243         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1244         break;
1245 
1246     case doBackslashw:
1247         fixLiterals(false);
1248         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1249         break;
1250 
1251     case doBackslashX:
1252         #if  UCONFIG_NO_BREAK_ITERATION==1
1253         // Grapheme Cluster Boundary requires ICU break iteration.
1254         error(U_UNSUPPORTED_ERROR);
1255         #endif
1256         fixLiterals(false);
1257         appendOp(URX_BACKSLASH_X, 0);
1258         break;
1259 
1260     case doBackslashZ:
1261         fixLiterals(false);
1262         appendOp(URX_DOLLAR, 0);
1263         break;
1264 
1265     case doBackslashz:
1266         fixLiterals(false);
1267         appendOp(URX_BACKSLASH_Z, 0);
1268         break;
1269 
1270     case doEscapeError:
1271         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1272         break;
1273 
1274     case doExit:
1275         fixLiterals(false);
1276         returnVal = false;
1277         break;
1278 
1279     case doProperty:
1280         {
1281             fixLiterals(false);
1282             UnicodeSet *theSet = scanProp();
1283             compileSet(theSet);
1284         }
1285         break;
1286 
1287     case doNamedChar:
1288         {
1289             UChar32 c = scanNamedChar();
1290             literalChar(c);
1291         }
1292         break;
1293 
1294 
1295     case doBackRef:
1296         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1297         //                 the regular expression, because the number of digits to be consumed
1298         //                 depends on the number of capture groups that have been defined.  So
1299         //                 we have to do it here instead.
1300         {
1301             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1302             int32_t  groupNum = 0;
1303             UChar32  c        = fC.fChar;
1304 
1305             for (;;) {
1306                 // Loop once per digit, for max allowed number of digits in a back reference.
1307                 int32_t digit = u_charDigitValue(c);
1308                 groupNum = groupNum * 10 + digit;
1309                 if (groupNum >= numCaptureGroups) {
1310                     break;
1311                 }
1312                 c = peekCharLL();
1313                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == false) {
1314                     break;
1315                 }
1316                 nextCharLL();
1317             }
1318 
1319             // Scan of the back reference in the source regexp is complete.  Now generate
1320             //  the compiled code for it.
1321             // Because capture groups can be forward-referenced by back-references,
1322             //  we fill the operand with the capture group number.  At the end
1323             //  of compilation, it will be changed to the variable's location.
1324             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1325                                      //    and shouldn't enter this code path at all.
1326             fixLiterals(false);
1327             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1328                 appendOp(URX_BACKREF_I, groupNum);
1329             } else {
1330                 appendOp(URX_BACKREF, groupNum);
1331             }
1332         }
1333         break;
1334 
1335     case doBeginNamedBackRef:
1336         U_ASSERT(fCaptureName == nullptr);
1337         fCaptureName = new UnicodeString;
1338         if (fCaptureName == nullptr) {
1339             error(U_MEMORY_ALLOCATION_ERROR);
1340         }
1341         break;
1342 
1343     case doContinueNamedBackRef:
1344         fCaptureName->append(fC.fChar);
1345         break;
1346 
1347     case doCompleteNamedBackRef:
1348         {
1349         int32_t groupNumber =
1350             fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1351         if (groupNumber == 0) {
1352             // Group name has not been defined.
1353             //   Could be a forward reference. If we choose to support them at some
1354             //   future time, extra mechanism will be required at this point.
1355             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1356         } else {
1357             // Given the number, handle identically to a \n numbered back reference.
1358             // See comments above, under doBackRef
1359             fixLiterals(false);
1360             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1361                 appendOp(URX_BACKREF_I, groupNumber);
1362             } else {
1363                 appendOp(URX_BACKREF, groupNumber);
1364             }
1365         }
1366         delete fCaptureName;
1367         fCaptureName = nullptr;
1368         break;
1369         }
1370 
1371     case doPossessivePlus:
1372         // Possessive ++ quantifier.
1373         // Compiles to
1374         //       1.   STO_SP
1375         //       2.      body of stuff being iterated over
1376         //       3.   STATE_SAVE 5
1377         //       4.   JMP        2
1378         //       5.   LD_SP
1379         //       6.   ...
1380         //
1381         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1382         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1383         //
1384         {
1385             // Emit the STO_SP
1386             int32_t   topLoc = blockTopLoc(true);
1387             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1388             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1389             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1390 
1391             // Emit the STATE_SAVE
1392             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1393 
1394             // Emit the JMP
1395             appendOp(URX_JMP, topLoc+1);
1396 
1397             // Emit the LD_SP
1398             appendOp(URX_LD_SP, stoLoc);
1399         }
1400         break;
1401 
1402     case doPossessiveStar:
1403         // Possessive *+ quantifier.
1404         // Compiles to
1405         //       1.   STO_SP       loc
1406         //       2.   STATE_SAVE   5
1407         //       3.      body of stuff being iterated over
1408         //       4.   JMP          2
1409         //       5.   LD_SP        loc
1410         //       6    ...
1411         // TODO:  do something to cut back the state stack each time through the loop.
1412         {
1413             // Reserve two slots at the top of the block.
1414             int32_t   topLoc = blockTopLoc(true);
1415             insertOp(topLoc);
1416 
1417             // emit   STO_SP     loc
1418             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1419             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1420             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1421 
1422             // Emit the SAVE_STATE   5
1423             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1424             op = buildOp(URX_STATE_SAVE, L7);
1425             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1426 
1427             // Append the JMP operation.
1428             appendOp(URX_JMP, topLoc+1);
1429 
1430             // Emit the LD_SP       loc
1431             appendOp(URX_LD_SP, stoLoc);
1432         }
1433         break;
1434 
1435     case doPossessiveOpt:
1436         // Possessive  ?+ quantifier.
1437         //  Compiles to
1438         //     1. STO_SP      loc
1439         //     2. SAVE_STATE  5
1440         //     3.    body of optional block
1441         //     4. LD_SP       loc
1442         //     5. ...
1443         //
1444         {
1445             // Reserve two slots at the top of the block.
1446             int32_t   topLoc = blockTopLoc(true);
1447             insertOp(topLoc);
1448 
1449             // Emit the STO_SP
1450             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1451             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1452             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1453 
1454             // Emit the SAVE_STATE
1455             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1456             op = buildOp(URX_STATE_SAVE, continueLoc);
1457             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1458 
1459             // Emit the LD_SP
1460             appendOp(URX_LD_SP, stoLoc);
1461         }
1462         break;
1463 
1464 
1465     case doBeginMatchMode:
1466         fNewModeFlags = fModeFlags;
1467         fSetModeFlag  = true;
1468         break;
1469 
1470     case doMatchMode:   //  (?i)    and similar
1471         {
1472             int32_t  bit = 0;
1473             switch (fC.fChar) {
1474             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1475             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1476             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1477             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1478             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1479             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1480             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1481             case 0x2d: /* '-' */   fSetModeFlag = false;          break;
1482             default:
1483                 UPRV_UNREACHABLE_EXIT;  // Should never happen.  Other chars are filtered out
1484                                         // by the scanner.
1485             }
1486             if (fSetModeFlag) {
1487                 fNewModeFlags |= bit;
1488             } else {
1489                 fNewModeFlags &= ~bit;
1490             }
1491         }
1492         break;
1493 
1494     case doSetMatchMode:
1495         // Emit code to match any pending literals, using the not-yet changed match mode.
1496         fixLiterals();
1497 
1498         // We've got a (?i) or similar.  The match mode is being changed, but
1499         //   the change is not scoped to a parenthesized block.
1500         U_ASSERT(fNewModeFlags < 0);
1501         fModeFlags = fNewModeFlags;
1502 
1503         break;
1504 
1505 
1506     case doMatchModeParen:
1507         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1508         //   mode flags so they can be restored at the close of the block.
1509         //
1510         //   Compile to a
1511         //      - NOP, which later may be replaced by a save-state if the
1512         //         parenthesized group gets a * quantifier, followed by
1513         //      - NOP, which may later be replaced by a save-state if there
1514         //             is an '|' alternation within the parens.
1515         {
1516             fixLiterals(false);
1517             appendOp(URX_NOP, 0);
1518             appendOp(URX_NOP, 0);
1519 
1520             // On the Parentheses stack, start a new frame and add the positions
1521             //   of the two NOPs (a normal non-capturing () frame, except for the
1522             //   saving of the original mode flags.)
1523             fParenStack.push(fModeFlags, *fStatus);
1524             fParenStack.push(flags, *fStatus);                            // Frame Marker
1525             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1526             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1527 
1528             // Set the current mode flags to the new values.
1529             U_ASSERT(fNewModeFlags < 0);
1530             fModeFlags = fNewModeFlags;
1531         }
1532         break;
1533 
1534     case doBadModeFlag:
1535         error(U_REGEX_INVALID_FLAG);
1536         break;
1537 
1538     case doSuppressComments:
1539         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1540         // treating a '#' as a to-the-end-of-line comment.
1541         //   (This Perl compatibility just gets uglier and uglier to do...)
1542         fEOLComments = false;
1543         break;
1544 
1545 
1546     case doSetAddAmp:
1547         {
1548           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1549           set->add(chAmp);
1550         }
1551         break;
1552 
1553     case doSetAddDash:
1554         {
1555           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1556           set->add(chDash);
1557         }
1558         break;
1559 
1560      case doSetBackslash_s:
1561         {
1562          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1563          set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1564          break;
1565         }
1566 
1567      case doSetBackslash_S:
1568         {
1569             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1570             UnicodeSet SSet;
1571             SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1572             set->addAll(SSet);
1573             break;
1574         }
1575 
1576     case doSetBackslash_d:
1577         {
1578             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1579             // TODO - make a static set, ticket 6058.
1580             addCategory(set, U_GC_ND_MASK, *fStatus);
1581             break;
1582         }
1583 
1584     case doSetBackslash_D:
1585         {
1586             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1587             UnicodeSet digits;
1588             // TODO - make a static set, ticket 6058.
1589             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1590             digits.complement();
1591             set->addAll(digits);
1592             break;
1593         }
1594 
1595     case doSetBackslash_h:
1596         {
1597             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1598             UnicodeSet h;
1599             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1600             h.add((UChar32)9);   // Tab
1601             set->addAll(h);
1602             break;
1603         }
1604 
1605     case doSetBackslash_H:
1606         {
1607             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1608             UnicodeSet h;
1609             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1610             h.add((UChar32)9);   // Tab
1611             h.complement();
1612             set->addAll(h);
1613             break;
1614         }
1615 
1616     case doSetBackslash_v:
1617         {
1618             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1619             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1620             set->add((UChar32)0x85);
1621             set->add((UChar32)0x2028, (UChar32)0x2029);
1622             break;
1623         }
1624 
1625     case doSetBackslash_V:
1626         {
1627             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1628             UnicodeSet v;
1629             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1630             v.add((UChar32)0x85);
1631             v.add((UChar32)0x2028, (UChar32)0x2029);
1632             v.complement();
1633             set->addAll(v);
1634             break;
1635         }
1636 
1637     case doSetBackslash_w:
1638         {
1639             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1640             set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1641             break;
1642         }
1643 
1644     case doSetBackslash_W:
1645         {
1646             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1647             UnicodeSet SSet;
1648             SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1649             set->addAll(SSet);
1650             break;
1651         }
1652 
1653     case doSetBegin:
1654         {
1655             fixLiterals(false);
1656             LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
1657             fSetStack.push(lpSet.orphan(), *fStatus);
1658             fSetOpStack.push(setStart, *fStatus);
1659             if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1660                 fSetOpStack.push(setCaseClose, *fStatus);
1661             }
1662             break;
1663         }
1664 
1665     case doSetBeginDifference1:
1666         //  We have scanned something like [[abc]-[
1667         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1668         //  Push a Difference operator, which will cause the new set to be subtracted from what
1669         //    went before once it is created.
1670         setPushOp(setDifference1);
1671         fSetOpStack.push(setStart, *fStatus);
1672         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1673             fSetOpStack.push(setCaseClose, *fStatus);
1674         }
1675         break;
1676 
1677     case doSetBeginIntersection1:
1678         //  We have scanned something like  [[abc]&[
1679         //   Need both the '&' operator and the open '[' operator.
1680         setPushOp(setIntersection1);
1681         fSetOpStack.push(setStart, *fStatus);
1682         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1683             fSetOpStack.push(setCaseClose, *fStatus);
1684         }
1685         break;
1686 
1687     case doSetBeginUnion:
1688         //  We have scanned something like  [[abc][
1689         //     Need to handle the union operation explicitly [[abc] | [
1690         setPushOp(setUnion);
1691         fSetOpStack.push(setStart, *fStatus);
1692         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1693             fSetOpStack.push(setCaseClose, *fStatus);
1694         }
1695         break;
1696 
1697     case doSetDifference2:
1698         // We have scanned something like [abc--
1699         //   Consider this to unambiguously be a set difference operator.
1700         setPushOp(setDifference2);
1701         break;
1702 
1703     case doSetEnd:
1704         // Have encountered the ']' that closes a set.
1705         //    Force the evaluation of any pending operations within this set,
1706         //    leave the completed set on the top of the set stack.
1707         setEval(setEnd);
1708         U_ASSERT(fSetOpStack.peeki()==setStart);
1709         fSetOpStack.popi();
1710         break;
1711 
1712     case doSetFinish:
1713         {
1714         // Finished a complete set expression, including all nested sets.
1715         //   The close bracket has already triggered clearing out pending set operators,
1716         //    the operator stack should be empty and the operand stack should have just
1717         //    one entry, the result set.
1718         U_ASSERT(fSetOpStack.empty());
1719         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1720         U_ASSERT(fSetStack.empty());
1721         compileSet(theSet);
1722         break;
1723         }
1724 
1725     case doSetIntersection2:
1726         // Have scanned something like [abc&&
1727         setPushOp(setIntersection2);
1728         break;
1729 
1730     case doSetLiteral:
1731         // Union the just-scanned literal character into the set being built.
1732         //    This operation is the highest precedence set operation, so we can always do
1733         //    it immediately, without waiting to see what follows.  It is necessary to perform
1734         //    any pending '-' or '&' operation first, because these have the same precedence
1735         //    as union-ing in a literal'
1736         {
1737             setEval(setUnion);
1738             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1739             s->add(fC.fChar);
1740             fLastSetLiteral = fC.fChar;
1741             break;
1742         }
1743 
1744     case doSetLiteralEscaped:
1745         // A back-slash escaped literal character was encountered.
1746         // Processing is the same as with setLiteral, above, with the addition of
1747         //  the optional check for errors on escaped ASCII letters.
1748         {
1749             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1750                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1751                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1752                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1753             }
1754             setEval(setUnion);
1755             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1756             s->add(fC.fChar);
1757             fLastSetLiteral = fC.fChar;
1758             break;
1759         }
1760 
1761         case doSetNamedChar:
1762         // Scanning a \N{UNICODE CHARACTER NAME}
1763         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1764         //    above.
1765         {
1766             UChar32  c = scanNamedChar();
1767             setEval(setUnion);
1768             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1769             s->add(c);
1770             fLastSetLiteral = c;
1771             break;
1772         }
1773 
1774     case doSetNamedRange:
1775         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1776         // The left character is already in the set, and is saved in fLastSetLiteral.
1777         // The right side needs to be picked up, the scan is at the 'N'.
1778         // Lower Limit > Upper limit being an error matches both Java
1779         //        and ICU UnicodeSet behavior.
1780         {
1781             UChar32  c = scanNamedChar();
1782             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1783                 error(U_REGEX_INVALID_RANGE);
1784             }
1785             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1786             s->add(fLastSetLiteral, c);
1787             fLastSetLiteral = c;
1788             break;
1789         }
1790 
1791 
1792     case  doSetNegate:
1793         // Scanned a '^' at the start of a set.
1794         // Push the negation operator onto the set op stack.
1795         // A twist for case-insensitive matching:
1796         //   the case closure operation must happen _before_ negation.
1797         //   But the case closure operation will already be on the stack if it's required.
1798         //   This requires checking for case closure, and swapping the stack order
1799         //    if it is present.
1800         {
1801             int32_t  tosOp = fSetOpStack.peeki();
1802             if (tosOp == setCaseClose) {
1803                 fSetOpStack.popi();
1804                 fSetOpStack.push(setNegation, *fStatus);
1805                 fSetOpStack.push(setCaseClose, *fStatus);
1806             } else {
1807                 fSetOpStack.push(setNegation, *fStatus);
1808             }
1809         }
1810         break;
1811 
1812     case doSetNoCloseError:
1813         error(U_REGEX_MISSING_CLOSE_BRACKET);
1814         break;
1815 
1816     case doSetOpError:
1817         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1818         break;
1819 
1820     case doSetPosixProp:
1821         {
1822             UnicodeSet *s = scanPosixProp();
1823             if (s != nullptr) {
1824                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1825                 tos->addAll(*s);
1826                 delete s;
1827             }  // else error.  scanProp() reported the error status already.
1828         }
1829         break;
1830 
1831     case doSetProp:
1832         //  Scanned a \p \P within [brackets].
1833         {
1834             UnicodeSet *s = scanProp();
1835             if (s != nullptr) {
1836                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1837                 tos->addAll(*s);
1838                 delete s;
1839             }  // else error.  scanProp() reported the error status already.
1840         }
1841         break;
1842 
1843 
1844     case doSetRange:
1845         // We have scanned literal-literal.  Add the range to the set.
1846         // The left character is already in the set, and is saved in fLastSetLiteral.
1847         // The right side is the current character.
1848         // Lower Limit > Upper limit being an error matches both Java
1849         //        and ICU UnicodeSet behavior.
1850         {
1851 
1852         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1853             error(U_REGEX_INVALID_RANGE);
1854         }
1855         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1856         s->add(fLastSetLiteral, fC.fChar);
1857         break;
1858         }
1859 
1860     default:
1861         UPRV_UNREACHABLE_EXIT;
1862     }
1863 
1864     if (U_FAILURE(*fStatus)) {
1865         returnVal = false;
1866     }
1867 
1868     return returnVal;
1869 }
1870 
1871 
1872 
1873 //------------------------------------------------------------------------------
1874 //
1875 //   literalChar           We've encountered a literal character from the pattern,
1876 //                             or an escape sequence that reduces to a character.
1877 //                         Add it to the string containing all literal chars/strings from
1878 //                             the pattern.
1879 //
1880 //------------------------------------------------------------------------------
literalChar(UChar32 c)1881 void RegexCompile::literalChar(UChar32 c)  {
1882     fLiteralChars.append(c);
1883 }
1884 
1885 
1886 //------------------------------------------------------------------------------
1887 //
1888 //    fixLiterals           When compiling something that can follow a literal
1889 //                          string in a pattern, emit the code to match the
1890 //                          accumulated literal string.
1891 //
1892 //                          Optionally, split the last char of the string off into
1893 //                          a single "ONE_CHAR" operation, so that quantifiers can
1894 //                          apply to that char alone.  Example:   abc*
1895 //                          The * must apply to the 'c' only.
1896 //
1897 //------------------------------------------------------------------------------
fixLiterals(UBool split)1898 void    RegexCompile::fixLiterals(UBool split) {
1899 
1900     // If no literal characters have been scanned but not yet had code generated
1901     //   for them, nothing needs to be done.
1902     if (fLiteralChars.length() == 0) {
1903         return;
1904     }
1905 
1906     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1907     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1908 
1909     // Split:  We need to  ensure that the last item in the compiled pattern
1910     //     refers only to the last literal scanned in the pattern, so that
1911     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1912     //     Split before case folding for case insensitive matches.
1913 
1914     if (split) {
1915         fLiteralChars.truncate(indexOfLastCodePoint);
1916         fixLiterals(false);   // Recursive call, emit code to match the first part of the string.
1917                               //  Note that the truncated literal string may be empty, in which case
1918                               //  nothing will be emitted.
1919 
1920         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1921         fixLiterals(false);          // Second recursive call, code for the final code point.
1922         return;
1923     }
1924 
1925     // If we are doing case-insensitive matching, case fold the string.  This may expand
1926     //   the string, e.g. the German sharp-s turns into "ss"
1927     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928         fLiteralChars.foldCase();
1929         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1930         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1931     }
1932 
1933     if (indexOfLastCodePoint == 0) {
1934         // Single character, emit a URX_ONECHAR op to match it.
1935         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1936                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1937             appendOp(URX_ONECHAR_I, lastCodePoint);
1938         } else {
1939             appendOp(URX_ONECHAR, lastCodePoint);
1940         }
1941     } else {
1942         // Two or more chars, emit a URX_STRING to match them.
1943         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1944             error(U_REGEX_PATTERN_TOO_BIG);
1945         }
1946         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1947             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1948         } else {
1949             // TODO here:  add optimization to split case sensitive strings of length two
1950             //             into two single char ops, for efficiency.
1951             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1952         }
1953         appendOp(URX_STRING_LEN, fLiteralChars.length());
1954 
1955         // Add this string into the accumulated strings of the compiled pattern.
1956         fRXPat->fLiteralText.append(fLiteralChars);
1957     }
1958 
1959     fLiteralChars.remove();
1960 }
1961 
1962 
buildOp(int32_t type,int32_t val)1963 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1964     if (U_FAILURE(*fStatus)) {
1965         return 0;
1966     }
1967     if (type < 0 || type > 255) {
1968         UPRV_UNREACHABLE_EXIT;
1969     }
1970     if (val > 0x00ffffff) {
1971         UPRV_UNREACHABLE_EXIT;
1972     }
1973     if (val < 0) {
1974         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1975             UPRV_UNREACHABLE_EXIT;
1976         }
1977         if (URX_TYPE(val) != 0xff) {
1978             UPRV_UNREACHABLE_EXIT;
1979         }
1980         type = URX_RESERVED_OP_N;
1981     }
1982     return (type << 24) | val;
1983 }
1984 
1985 
1986 //------------------------------------------------------------------------------
1987 //
1988 //   appendOp()             Append a new instruction onto the compiled pattern
1989 //                          Includes error checking, limiting the size of the
1990 //                          pattern to lengths that can be represented in the
1991 //                          24 bit operand field of an instruction.
1992 //
1993 //------------------------------------------------------------------------------
appendOp(int32_t op)1994 void RegexCompile::appendOp(int32_t op) {
1995     if (U_FAILURE(*fStatus)) {
1996         return;
1997     }
1998     fRXPat->fCompiledPat->addElement(op, *fStatus);
1999     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2000         error(U_REGEX_PATTERN_TOO_BIG);
2001     }
2002 }
2003 
appendOp(int32_t type,int32_t val)2004 void RegexCompile::appendOp(int32_t type, int32_t val) {
2005     appendOp(buildOp(type, val));
2006 }
2007 
2008 
2009 //------------------------------------------------------------------------------
2010 //
2011 //   insertOp()             Insert a slot for a new opcode into the already
2012 //                          compiled pattern code.
2013 //
2014 //                          Fill the slot with a NOP.  Our caller will replace it
2015 //                          with what they really wanted.
2016 //
2017 //------------------------------------------------------------------------------
insertOp(int32_t where)2018 void   RegexCompile::insertOp(int32_t where) {
2019     UVector64 *code = fRXPat->fCompiledPat;
2020     U_ASSERT(where>0 && where < code->size());
2021 
2022     int32_t  nop = buildOp(URX_NOP, 0);
2023     code->insertElementAt(nop, where, *fStatus);
2024 
2025     // Walk through the pattern, looking for any ops with targets that
2026     //  were moved down by the insert.  Fix them.
2027     int32_t loc;
2028     for (loc=0; loc<code->size(); loc++) {
2029         int32_t op = (int32_t)code->elementAti(loc);
2030         int32_t opType = URX_TYPE(op);
2031         int32_t opValue = URX_VAL(op);
2032         if ((opType == URX_JMP         ||
2033             opType == URX_JMPX         ||
2034             opType == URX_STATE_SAVE   ||
2035             opType == URX_CTR_LOOP     ||
2036             opType == URX_CTR_LOOP_NG  ||
2037             opType == URX_JMP_SAV      ||
2038             opType == URX_JMP_SAV_X    ||
2039             opType == URX_RELOC_OPRND)    && opValue > where) {
2040             // Target location for this opcode is after the insertion point and
2041             //   needs to be incremented to adjust for the insertion.
2042             opValue++;
2043             op = buildOp(opType, opValue);
2044             code->setElementAt(op, loc);
2045         }
2046     }
2047 
2048     // Now fix up the parentheses stack.  All positive values in it are locations in
2049     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2050     for (loc=0; loc<fParenStack.size(); loc++) {
2051         int32_t x = fParenStack.elementAti(loc);
2052         U_ASSERT(x < code->size());
2053         if (x>where) {
2054             x++;
2055             fParenStack.setElementAt(x, loc);
2056         }
2057     }
2058 
2059     if (fMatchCloseParen > where) {
2060         fMatchCloseParen++;
2061     }
2062     if (fMatchOpenParen > where) {
2063         fMatchOpenParen++;
2064     }
2065 }
2066 
2067 
2068 //------------------------------------------------------------------------------
2069 //
2070 //   allocateData()        Allocate storage in the matcher's static data area.
2071 //                         Return the index for the newly allocated data.
2072 //                         The storage won't actually exist until we are running a match
2073 //                         operation, but the storage indexes are inserted into various
2074 //                         opcodes while compiling the pattern.
2075 //
2076 //------------------------------------------------------------------------------
allocateData(int32_t size)2077 int32_t RegexCompile::allocateData(int32_t size) {
2078     if (U_FAILURE(*fStatus)) {
2079         return 0;
2080     }
2081     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2082         error(U_REGEX_INTERNAL_ERROR);
2083         return 0;
2084     }
2085     int32_t dataIndex = fRXPat->fDataSize;
2086     fRXPat->fDataSize += size;
2087     if (fRXPat->fDataSize >= 0x00fffff0) {
2088         error(U_REGEX_INTERNAL_ERROR);
2089     }
2090     return dataIndex;
2091 }
2092 
2093 
2094 //------------------------------------------------------------------------------
2095 //
2096 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2097 //                         Return the index for the newly allocated data.
2098 //                         The frame indexes are inserted into various
2099 //                         opcodes while compiling the pattern, meaning that frame
2100 //                         size must be restricted to the size that will fit
2101 //                         as an operand (24 bits).
2102 //
2103 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2104 int32_t RegexCompile::allocateStackData(int32_t size) {
2105     if (U_FAILURE(*fStatus)) {
2106         return 0;
2107     }
2108     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2109         error(U_REGEX_INTERNAL_ERROR);
2110         return 0;
2111     }
2112     int32_t dataIndex = fRXPat->fFrameSize;
2113     fRXPat->fFrameSize += size;
2114     if (fRXPat->fFrameSize >= 0x00fffff0) {
2115         error(U_REGEX_PATTERN_TOO_BIG);
2116     }
2117     return dataIndex;
2118 }
2119 
2120 
2121 //------------------------------------------------------------------------------
2122 //
2123 //   blockTopLoc()          Find or create a location in the compiled pattern
2124 //                          at the start of the operation or block that has
2125 //                          just been compiled.  Needed when a quantifier (* or
2126 //                          whatever) appears, and we need to add an operation
2127 //                          at the start of the thing being quantified.
2128 //
2129 //                          (Parenthesized Blocks) have a slot with a NOP that
2130 //                          is reserved for this purpose.  .* or similar don't
2131 //                          and a slot needs to be added.
2132 //
2133 //       parameter reserveLoc   :  true -  ensure that there is space to add an opcode
2134 //                                         at the returned location.
2135 //                                 false - just return the address,
2136 //                                         do not reserve a location there.
2137 //
2138 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2139 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2140     int32_t   theLoc;
2141     fixLiterals(true);  // Emit code for any pending literals.
2142                         //   If last item was a string, emit separate op for the its last char.
2143     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2144     {
2145         // The item just processed is a parenthesized block.
2146         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2147         U_ASSERT(theLoc > 0);
2148         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2149     }
2150     else {
2151         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2152         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2153         // We need to make space now.
2154         theLoc = fRXPat->fCompiledPat->size()-1;
2155         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2156         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2157             // Strings take two opcode, we want the position of the first one.
2158             // We can have a string at this point if a single character case-folded to two.
2159             theLoc--;
2160         }
2161         if (reserveLoc) {
2162             int32_t  nop = buildOp(URX_NOP, 0);
2163             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2164         }
2165     }
2166     return theLoc;
2167 }
2168 
2169 
2170 
2171 //------------------------------------------------------------------------------
2172 //
2173 //    handleCloseParen      When compiling a close paren, we need to go back
2174 //                          and fix up any JMP or SAVE operations within the
2175 //                          parenthesized block that need to target the end
2176 //                          of the block.  The locations of these are kept on
2177 //                          the paretheses stack.
2178 //
2179 //                          This function is called both when encountering a
2180 //                          real ) and at the end of the pattern.
2181 //
2182 //------------------------------------------------------------------------------
handleCloseParen()2183 void  RegexCompile::handleCloseParen() {
2184     int32_t   patIdx;
2185     int32_t   patOp;
2186     if (fParenStack.size() <= 0) {
2187         error(U_REGEX_MISMATCHED_PAREN);
2188         return;
2189     }
2190 
2191     // Emit code for any pending literals.
2192     fixLiterals(false);
2193 
2194     // Fixup any operations within the just-closed parenthesized group
2195     //    that need to reference the end of the (block).
2196     //    (The first one popped from the stack is an unused slot for
2197     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2198     for (;;) {
2199         patIdx = fParenStack.popi();
2200         if (patIdx < 0) {
2201             // value < 0 flags the start of the frame on the paren stack.
2202             break;
2203         }
2204         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2205         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2206         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2207         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2208         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2209         fMatchOpenParen     = patIdx;
2210     }
2211 
2212     //  At the close of any parenthesized block, restore the match mode flags  to
2213     //  the value they had at the open paren.  Saved value is
2214     //  at the top of the paren stack.
2215     fModeFlags = fParenStack.popi();
2216     U_ASSERT(fModeFlags < 0);
2217 
2218     // DO any additional fixups, depending on the specific kind of
2219     // parentesized grouping this is
2220 
2221     switch (patIdx) {
2222     case plain:
2223     case flags:
2224         // No additional fixups required.
2225         //   (Grouping-only parentheses)
2226         break;
2227     case capturing:
2228         // Capturing Parentheses.
2229         //   Insert a End Capture op into the pattern.
2230         //   The frame offset of the variables for this cg is obtained from the
2231         //       start capture op and put it into the end-capture op.
2232         {
2233             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2234             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2235 
2236             int32_t   frameVarLocation = URX_VAL(captureOp);
2237             appendOp(URX_END_CAPTURE, frameVarLocation);
2238         }
2239         break;
2240     case atomic:
2241         // Atomic Parenthesis.
2242         //   Insert a LD_SP operation to restore the state stack to the position
2243         //   it was when the atomic parens were entered.
2244         {
2245             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2246             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2247             int32_t   stoLoc = URX_VAL(stoOp);
2248             appendOp(URX_LD_SP, stoLoc);
2249         }
2250         break;
2251 
2252     case lookAhead:
2253         {
2254             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2255             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2256             int32_t dataLoc  = URX_VAL(startOp);
2257             appendOp(URX_LA_END, dataLoc);
2258         }
2259         break;
2260 
2261     case negLookAhead:
2262         {
2263             // See comment at doOpenLookAheadNeg
2264             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2265             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2266             int32_t dataLoc  = URX_VAL(startOp);
2267             appendOp(URX_LA_END, dataLoc);
2268             appendOp(URX_BACKTRACK, 0);
2269             appendOp(URX_LA_END, dataLoc);
2270 
2271             // Patch the URX_SAVE near the top of the block.
2272             // The destination of the SAVE is the final LA_END that was just added.
2273             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2274             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2275             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2276             saveOp           = buildOp(URX_STATE_SAVE, dest);
2277             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2278         }
2279         break;
2280 
2281     case lookBehind:
2282         {
2283             // See comment at doOpenLookBehind.
2284 
2285             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2286             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2287             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2288             int32_t dataLoc  = URX_VAL(startOp);
2289             appendOp(URX_LB_END, dataLoc);
2290             appendOp(URX_LA_END, dataLoc);
2291 
2292             // Determine the min and max bounds for the length of the
2293             //  string that the pattern can match.
2294             //  An unbounded upper limit is an error.
2295             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2296             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2297             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2298             if (URX_TYPE(maxML) != 0) {
2299                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2300                 break;
2301             }
2302             if (maxML == INT32_MAX) {
2303                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2304                 break;
2305             }
2306             if (minML == INT32_MAX) {
2307                 // This condition happens when no match is possible, such as with a
2308                 // [set] expression containing no elements.
2309                 // In principle, the generated code to evaluate the expression could be deleted,
2310                 // but it's probably not worth the complication.
2311                 minML = 0;
2312             }
2313             U_ASSERT(minML <= maxML);
2314 
2315             // Insert the min and max match len bounds into the URX_LB_CONT op that
2316             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2317             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2318             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2319 
2320         }
2321         break;
2322 
2323 
2324 
2325     case lookBehindN:
2326         {
2327             // See comment at doOpenLookBehindNeg.
2328 
2329             // Append the URX_LBN_END to the compiled pattern.
2330             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2331             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2332             int32_t dataLoc  = URX_VAL(startOp);
2333             appendOp(URX_LBN_END, dataLoc);
2334 
2335             // Determine the min and max bounds for the length of the
2336             //  string that the pattern can match.
2337             //  An unbounded upper limit is an error.
2338             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2339             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2340             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2341             if (URX_TYPE(maxML) != 0) {
2342                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2343                 break;
2344             }
2345             if (maxML == INT32_MAX) {
2346                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2347                 break;
2348             }
2349             if (minML == INT32_MAX) {
2350                 // This condition happens when no match is possible, such as with a
2351                 // [set] expression containing no elements.
2352                 // In principle, the generated code to evaluate the expression could be deleted,
2353                 // but it's probably not worth the complication.
2354                 minML = 0;
2355             }
2356 
2357             U_ASSERT(minML <= maxML);
2358 
2359             // Insert the min and max match len bounds into the URX_LB_CONT op that
2360             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2361             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2362             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2363 
2364             // Insert the pattern location to continue at after a successful match
2365             //  as the last operand of the URX_LBN_CONT
2366             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2367             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2368         }
2369         break;
2370 
2371 
2372 
2373     default:
2374         UPRV_UNREACHABLE_EXIT;
2375     }
2376 
2377     // remember the next location in the compiled pattern.
2378     // The compilation of Quantifiers will look at this to see whether its looping
2379     //   over a parenthesized block or a single item
2380     fMatchCloseParen = fRXPat->fCompiledPat->size();
2381 }
2382 
2383 
2384 
2385 //------------------------------------------------------------------------------
2386 //
2387 //   compileSet       Compile the pattern operations for a reference to a
2388 //                    UnicodeSet.
2389 //
2390 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2391 void        RegexCompile::compileSet(UnicodeSet *theSet)
2392 {
2393     if (theSet == nullptr) {
2394         return;
2395     }
2396     //  Remove any strings from the set.
2397     //  There shouldn't be any, but just in case.
2398     //     (Case Closure can add them; if we had a simple case closure available that
2399     //      ignored strings, that would be better.)
2400     theSet->removeAllStrings();
2401     int32_t  setSize = theSet->size();
2402 
2403     switch (setSize) {
2404     case 0:
2405         {
2406             // Set of no elements.   Always fails to match.
2407             appendOp(URX_BACKTRACK, 0);
2408             delete theSet;
2409         }
2410         break;
2411 
2412     case 1:
2413         {
2414             // The set contains only a single code point.  Put it into
2415             //   the compiled pattern as a single char operation rather
2416             //   than a set, and discard the set itself.
2417             literalChar(theSet->charAt(0));
2418             delete theSet;
2419         }
2420         break;
2421 
2422     default:
2423         {
2424             //  The set contains two or more chars.  (the normal case)
2425             //  Put it into the compiled pattern as a set.
2426             theSet->freeze();
2427             int32_t setNumber = fRXPat->fSets->size();
2428             fRXPat->fSets->addElement(theSet, *fStatus);
2429             if (U_SUCCESS(*fStatus)) {
2430                 appendOp(URX_SETREF, setNumber);
2431             } else {
2432                 delete theSet;
2433             }
2434         }
2435     }
2436 }
2437 
2438 
2439 //------------------------------------------------------------------------------
2440 //
2441 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2442 //                      Except for the specific opcodes used, the code is the same
2443 //                      for all three types (greedy, non-greedy, possessive) of
2444 //                      intervals.  The opcodes are supplied as parameters.
2445 //                      (There are two sets of opcodes - greedy & possessive use the
2446 //                      same ones, while non-greedy has it's own.)
2447 //
2448 //                      The code for interval loops has this form:
2449 //                         0  CTR_INIT   counter loc (in stack frame)
2450 //                         1             5  patt address of CTR_LOOP at bottom of block
2451 //                         2             min count
2452 //                         3             max count   (-1 for unbounded)
2453 //                         4  ...        block to be iterated over
2454 //                         5  CTR_LOOP
2455 //
2456 //                       In
2457 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2458 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2459 {
2460     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2461     //   four slots in the compiled code.  Reserve them.
2462     int32_t   topOfBlock = blockTopLoc(true);
2463     insertOp(topOfBlock);
2464     insertOp(topOfBlock);
2465     insertOp(topOfBlock);
2466 
2467     // The operands for the CTR_INIT opcode include the index in the matcher data
2468     //   of the counter.  Allocate it now. There are two data items
2469     //        counterLoc   -->  Loop counter
2470     //               +1    -->  Input index (for breaking non-progressing loops)
2471     //                          (Only present if unbounded upper limit on loop)
2472     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2473     int32_t   counterLoc = allocateStackData(dataSize);
2474 
2475     int32_t   op = buildOp(InitOp, counterLoc);
2476     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2477 
2478     // The second operand of CTR_INIT is the location following the end of the loop.
2479     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2480     //   compilation of something later on causes the code to grow and the target
2481     //   position to move.
2482     int32_t loopEnd = fRXPat->fCompiledPat->size();
2483     op = buildOp(URX_RELOC_OPRND, loopEnd);
2484     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2485 
2486     // Followed by the min and max counts.
2487     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2488     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2489 
2490     // Append the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2491     //   Goes at end of the block being looped over, so just append to the code so far.
2492     appendOp(LoopOp, topOfBlock);
2493 
2494     if ((fIntervalLow & 0xff000000) != 0 ||
2495         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2496             error(U_REGEX_NUMBER_TOO_BIG);
2497         }
2498 
2499     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2500         error(U_REGEX_MAX_LT_MIN);
2501     }
2502 }
2503 
2504 
2505 
compileInlineInterval()2506 UBool RegexCompile::compileInlineInterval() {
2507     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2508         // Too big to inline.  Fail, which will cause looping code to be generated.
2509         //   (Upper < Lower picks up unbounded upper and errors, both.)
2510         return false;
2511     }
2512 
2513     int32_t   topOfBlock = blockTopLoc(false);
2514     if (fIntervalUpper == 0) {
2515         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2516         // Discard the generated code for the block.
2517         // If the block included parens, discard the info pertaining to them as well.
2518         fRXPat->fCompiledPat->setSize(topOfBlock);
2519         if (fMatchOpenParen >= topOfBlock) {
2520             fMatchOpenParen = -1;
2521         }
2522         if (fMatchCloseParen >= topOfBlock) {
2523             fMatchCloseParen = -1;
2524         }
2525         return true;
2526     }
2527 
2528     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2529         // The thing being repeated is not a single op, but some
2530         //   more complex block.  Do it as a loop, not inlines.
2531         //   Note that things "repeated" a max of once are handled as inline, because
2532         //     the one copy of the code already generated is just fine.
2533         return false;
2534     }
2535 
2536     // Pick up the opcode that is to be repeated
2537     //
2538     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2539 
2540     // Compute the pattern location where the inline sequence
2541     //   will end, and set up the state save op that will be needed.
2542     //
2543     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2544                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2545     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2546     if (fIntervalLow == 0) {
2547         insertOp(topOfBlock);
2548         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2549     }
2550 
2551 
2552 
2553     //  Loop, emitting the op for the thing being repeated each time.
2554     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2555     //    it was put there when it was originally encountered.
2556     int32_t i;
2557     for (i=1; i<fIntervalUpper; i++ ) {
2558         if (i >= fIntervalLow) {
2559             appendOp(saveOp);
2560         }
2561         appendOp(op);
2562     }
2563     return true;
2564 }
2565 
2566 
2567 
2568 //------------------------------------------------------------------------------
2569 //
2570 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2571 //                         set of characters that could potentially begin a case-insensitive
2572 //                         match of a string beginning with that character, using full Unicode
2573 //                         case insensitive matching.
2574 //
2575 //          This is used in optimizing find().
2576 //
2577 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2578 //          misses cases like this:
2579 //             A string from the pattern begins with 'ss' (although all we know
2580 //                 in this context is that it begins with 's')
2581 //             The pattern could match a string beginning with a German sharp-s
2582 //
2583 //           To the ordinary case closure for a character c, we add all other
2584 //           characters cx where the case closure of cx includes a string form that begins
2585 //           with the original character c.
2586 //
2587 //           This function could be made smarter. The full pattern string is available
2588 //           and it would be possible to verify that the extra characters being added
2589 //           to the starting set fully match, rather than having just a first-char of the
2590 //           folded form match.
2591 //
2592 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2593 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2594 
2595 // Machine Generated below.
2596 // It may need updating with new versions of Unicode.
2597 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2598 // The update tool is here:
2599 // https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
2600 
2601 // Machine Generated Data. Do not hand edit.
2602     static const UChar32 RECaseFixCodePoints[] = {
2603         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2604         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2605         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2606         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2607         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2608 
2609     static const int16_t RECaseFixStringOffsets[] = {
2610         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2611         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2612         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2613         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2614         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2615 
2616     static const int16_t RECaseFixCounts[] = {
2617         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2618         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2619         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2620         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2621         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2622 
2623     static const char16_t RECaseFixData[] = {
2624         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2625         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2626         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2627         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2628         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2629         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2630         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2631         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2632         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2633         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2634         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2635 
2636 // End of machine generated data.
2637 
2638     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2639         // This function should never be called with an invalid input character.
2640         UPRV_UNREACHABLE_EXIT;
2641     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2642         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2643         starterChars->set(caseFoldedC, caseFoldedC);
2644 
2645         int32_t i;
2646         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2647             // Simple linear search through the sorted list of interesting code points.
2648         }
2649 
2650         if (RECaseFixCodePoints[i] == c) {
2651             int32_t dataIndex = RECaseFixStringOffsets[i];
2652             int32_t numCharsToAdd = RECaseFixCounts[i];
2653             UChar32 cpToAdd = 0;
2654             for (int32_t j=0; j<numCharsToAdd; j++) {
2655                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2656                 starterChars->add(cpToAdd);
2657             }
2658         }
2659 
2660         starterChars->closeOver(USET_CASE_INSENSITIVE);
2661         starterChars->removeAllStrings();
2662     } else {
2663         // Not a cased character. Just return it alone.
2664         starterChars->set(c, c);
2665     }
2666 }
2667 
2668 
2669 // Increment with overflow check.
2670 // val and delta will both be positive.
2671 
safeIncrement(int32_t val,int32_t delta)2672 static int32_t safeIncrement(int32_t val, int32_t delta) {
2673     if (INT32_MAX - val > delta) {
2674         return val + delta;
2675     } else {
2676         return INT32_MAX;
2677     }
2678 }
2679 
2680 
2681 //------------------------------------------------------------------------------
2682 //
2683 //   matchStartType    Determine how a match can start.
2684 //                     Used to optimize find() operations.
2685 //
2686 //                     Operation is very similar to minMatchLength().  Walk the compiled
2687 //                     pattern, keeping an on-going minimum-match-length.  For any
2688 //                     op where the min match coming in is zero, add that ops possible
2689 //                     starting matches to the possible starts for the overall pattern.
2690 //
2691 //------------------------------------------------------------------------------
matchStartType()2692 void   RegexCompile::matchStartType() {
2693     if (U_FAILURE(*fStatus)) {
2694         return;
2695     }
2696 
2697 
2698     int32_t    loc;                    // Location in the pattern of the current op being processed.
2699     int32_t    op;                     // The op being processed
2700     int32_t    opType;                 // The opcode type of the op
2701     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2702     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2703 
2704     UBool      atStart = true;         // True if no part of the pattern yet encountered
2705                                        //   could have advanced the position in a match.
2706                                        //   (Maximum match length so far == 0)
2707 
2708     // forwardedLength is a vector holding minimum-match-length values that
2709     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2710     //   It must be one longer than the pattern being checked because some  ops
2711     //   will jmp to a end-of-block+1 location from within a block, and we must
2712     //   count those when checking the block.
2713     int32_t end = fRXPat->fCompiledPat->size();
2714     UVector32  forwardedLength(end+1, *fStatus);
2715     forwardedLength.setSize(end+1);
2716     for (loc=3; loc<end; loc++) {
2717         forwardedLength.setElementAt(INT32_MAX, loc);
2718     }
2719 
2720     for (loc = 3; loc<end; loc++) {
2721         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2722         opType = URX_TYPE(op);
2723 
2724         // The loop is advancing linearly through the pattern.
2725         // If the op we are now at was the destination of a branch in the pattern,
2726         // and that path has a shorter minimum length than the current accumulated value,
2727         // replace the current accumulated value.
2728         if (forwardedLength.elementAti(loc) < currentLen) {
2729             currentLen = forwardedLength.elementAti(loc);
2730             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2731         }
2732 
2733         switch (opType) {
2734             // Ops that don't change the total length matched
2735         case URX_RESERVED_OP:
2736         case URX_END:
2737         case URX_FAIL:
2738         case URX_STRING_LEN:
2739         case URX_NOP:
2740         case URX_START_CAPTURE:
2741         case URX_END_CAPTURE:
2742         case URX_BACKSLASH_B:
2743         case URX_BACKSLASH_BU:
2744         case URX_BACKSLASH_G:
2745         case URX_BACKSLASH_Z:
2746         case URX_DOLLAR:
2747         case URX_DOLLAR_M:
2748         case URX_DOLLAR_D:
2749         case URX_DOLLAR_MD:
2750         case URX_RELOC_OPRND:
2751         case URX_STO_INP_LOC:
2752         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2753         case URX_BACKREF_I:
2754 
2755         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2756         case URX_LD_SP:
2757             break;
2758 
2759         case URX_CARET:
2760             if (atStart) {
2761                 fRXPat->fStartType = START_START;
2762             }
2763             break;
2764 
2765         case URX_CARET_M:
2766         case URX_CARET_M_UNIX:
2767             if (atStart) {
2768                 fRXPat->fStartType = START_LINE;
2769             }
2770             break;
2771 
2772         case URX_ONECHAR:
2773             if (currentLen == 0) {
2774                 // This character could appear at the start of a match.
2775                 //   Add it to the set of possible starting characters.
2776                 fRXPat->fInitialChars->add(URX_VAL(op));
2777                 numInitialStrings += 2;
2778             }
2779             currentLen = safeIncrement(currentLen, 1);
2780             atStart = false;
2781             break;
2782 
2783 
2784         case URX_SETREF:
2785             if (currentLen == 0) {
2786                 int32_t  sn = URX_VAL(op);
2787                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2788                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2789                 fRXPat->fInitialChars->addAll(*s);
2790                 numInitialStrings += 2;
2791             }
2792             currentLen = safeIncrement(currentLen, 1);
2793             atStart = false;
2794             break;
2795 
2796         case URX_LOOP_SR_I:
2797             // [Set]*, like a SETREF, above, in what it can match,
2798             //  but may not match at all, so currentLen is not incremented.
2799             if (currentLen == 0) {
2800                 int32_t  sn = URX_VAL(op);
2801                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2802                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2803                 fRXPat->fInitialChars->addAll(*s);
2804                 numInitialStrings += 2;
2805             }
2806             atStart = false;
2807             break;
2808 
2809         case URX_LOOP_DOT_I:
2810             if (currentLen == 0) {
2811                 // .* at the start of a pattern.
2812                 //    Any character can begin the match.
2813                 fRXPat->fInitialChars->clear();
2814                 fRXPat->fInitialChars->complement();
2815                 numInitialStrings += 2;
2816             }
2817             atStart = false;
2818             break;
2819 
2820 
2821         case URX_STATIC_SETREF:
2822             if (currentLen == 0) {
2823                 int32_t  sn = URX_VAL(op);
2824                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2825                 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2826                 fRXPat->fInitialChars->addAll(s);
2827                 numInitialStrings += 2;
2828             }
2829             currentLen = safeIncrement(currentLen, 1);
2830             atStart = false;
2831             break;
2832 
2833 
2834 
2835         case URX_STAT_SETREF_N:
2836             if (currentLen == 0) {
2837                 int32_t  sn = URX_VAL(op);
2838                 UnicodeSet sc;
2839                 sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2840                 fRXPat->fInitialChars->addAll(sc);
2841                 numInitialStrings += 2;
2842             }
2843             currentLen = safeIncrement(currentLen, 1);
2844             atStart = false;
2845             break;
2846 
2847 
2848 
2849         case URX_BACKSLASH_D:
2850             // Digit Char
2851              if (currentLen == 0) {
2852                  UnicodeSet s;
2853                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2854                  if (URX_VAL(op) != 0) {
2855                      s.complement();
2856                  }
2857                  fRXPat->fInitialChars->addAll(s);
2858                  numInitialStrings += 2;
2859             }
2860             currentLen = safeIncrement(currentLen, 1);
2861             atStart = false;
2862             break;
2863 
2864 
2865         case URX_BACKSLASH_H:
2866             // Horiz white space
2867             if (currentLen == 0) {
2868                 UnicodeSet s;
2869                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2870                 s.add((UChar32)9);   // Tab
2871                 if (URX_VAL(op) != 0) {
2872                     s.complement();
2873                 }
2874                 fRXPat->fInitialChars->addAll(s);
2875                 numInitialStrings += 2;
2876             }
2877             currentLen = safeIncrement(currentLen, 1);
2878             atStart = false;
2879             break;
2880 
2881 
2882         case URX_BACKSLASH_R:       // Any line ending sequence
2883         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2884             if (currentLen == 0) {
2885                 UnicodeSet s;
2886                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2887                 s.add((UChar32)0x85);
2888                 s.add((UChar32)0x2028, (UChar32)0x2029);
2889                 if (URX_VAL(op) != 0) {
2890                      // Complement option applies to URX_BACKSLASH_V only.
2891                      s.complement();
2892                 }
2893                 fRXPat->fInitialChars->addAll(s);
2894                 numInitialStrings += 2;
2895             }
2896             currentLen = safeIncrement(currentLen, 1);
2897             atStart = false;
2898             break;
2899 
2900 
2901 
2902         case URX_ONECHAR_I:
2903             // Case Insensitive Single Character.
2904             if (currentLen == 0) {
2905                 UChar32  c = URX_VAL(op);
2906                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2907                     UnicodeSet starters(c, c);
2908                     starters.closeOver(USET_CASE_INSENSITIVE);
2909                     // findCaseInsensitiveStarters(c, &starters);
2910                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2911                     //   The expanded folding can't match the pattern.
2912                     fRXPat->fInitialChars->addAll(starters);
2913                 } else {
2914                     // Char has no case variants.  Just add it as-is to the
2915                     //   set of possible starting chars.
2916                     fRXPat->fInitialChars->add(c);
2917                 }
2918                 numInitialStrings += 2;
2919             }
2920             currentLen = safeIncrement(currentLen, 1);
2921             atStart = false;
2922             break;
2923 
2924 
2925         case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
2926         case URX_DOTANY_ALL:    // . matches one or two.
2927         case URX_DOTANY:
2928         case URX_DOTANY_UNIX:
2929             if (currentLen == 0) {
2930                 // These constructs are all bad news when they appear at the start
2931                 //   of a match.  Any character can begin the match.
2932                 fRXPat->fInitialChars->clear();
2933                 fRXPat->fInitialChars->complement();
2934                 numInitialStrings += 2;
2935             }
2936             currentLen = safeIncrement(currentLen, 1);
2937             atStart = false;
2938             break;
2939 
2940 
2941         case URX_JMPX:
2942             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2943             U_FALLTHROUGH;
2944         case URX_JMP:
2945             {
2946                 int32_t  jmpDest = URX_VAL(op);
2947                 if (jmpDest < loc) {
2948                     // Loop of some kind.  Can safely ignore, the worst that will happen
2949                     //  is that we understate the true minimum length
2950                     currentLen = forwardedLength.elementAti(loc+1);
2951 
2952                 } else {
2953                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2954                     U_ASSERT(jmpDest <= end+1);
2955                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2956                         forwardedLength.setElementAt(currentLen, jmpDest);
2957                     }
2958                 }
2959             }
2960             atStart = false;
2961             break;
2962 
2963         case URX_JMP_SAV:
2964         case URX_JMP_SAV_X:
2965             // Combo of state save to the next loc, + jmp backwards.
2966             //   Net effect on min. length computation is nothing.
2967             atStart = false;
2968             break;
2969 
2970         case URX_BACKTRACK:
2971             // Fails are kind of like a branch, except that the min length was
2972             //   propagated already, by the state save.
2973             currentLen = forwardedLength.elementAti(loc+1);
2974             atStart = false;
2975             break;
2976 
2977 
2978         case URX_STATE_SAVE:
2979             {
2980                 // State Save, for forward jumps, propagate the current minimum.
2981                 //             of the state save.
2982                 int32_t  jmpDest = URX_VAL(op);
2983                 if (jmpDest > loc) {
2984                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2985                         forwardedLength.setElementAt(currentLen, jmpDest);
2986                     }
2987                 }
2988             }
2989             atStart = false;
2990             break;
2991 
2992 
2993 
2994 
2995         case URX_STRING:
2996             {
2997                 loc++;
2998                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2999                 int32_t stringLen   = URX_VAL(stringLenOp);
3000                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3001                 U_ASSERT(stringLenOp >= 2);
3002                 if (currentLen == 0) {
3003                     // Add the starting character of this string to the set of possible starting
3004                     //   characters for this pattern.
3005                     int32_t stringStartIdx = URX_VAL(op);
3006                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3007                     fRXPat->fInitialChars->add(c);
3008 
3009                     // Remember this string.  After the entire pattern has been checked,
3010                     //  if nothing else is identified that can start a match, we'll use it.
3011                     numInitialStrings++;
3012                     fRXPat->fInitialStringIdx = stringStartIdx;
3013                     fRXPat->fInitialStringLen = stringLen;
3014                 }
3015 
3016                 currentLen = safeIncrement(currentLen, stringLen);
3017                 atStart = false;
3018             }
3019             break;
3020 
3021         case URX_STRING_I:
3022             {
3023                 // Case-insensitive string.  Unlike exact-match strings, we won't
3024                 //   attempt a string search for possible match positions.  But we
3025                 //   do update the set of possible starting characters.
3026                 loc++;
3027                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3028                 int32_t stringLen   = URX_VAL(stringLenOp);
3029                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3030                 U_ASSERT(stringLenOp >= 2);
3031                 if (currentLen == 0) {
3032                     // Add the starting character of this string to the set of possible starting
3033                     //   characters for this pattern.
3034                     int32_t stringStartIdx = URX_VAL(op);
3035                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3036                     UnicodeSet s;
3037                     findCaseInsensitiveStarters(c, &s);
3038                     fRXPat->fInitialChars->addAll(s);
3039                     numInitialStrings += 2;  // Matching on an initial string not possible.
3040                 }
3041                 currentLen = safeIncrement(currentLen, stringLen);
3042                 atStart = false;
3043             }
3044             break;
3045 
3046         case URX_CTR_INIT:
3047         case URX_CTR_INIT_NG:
3048             {
3049                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3050                 //   so location must be updated accordingly.
3051                 // Loop Init Ops.
3052                 //   If the min loop count == 0
3053                 //      move loc forwards to the end of the loop, skipping over the body.
3054                 //   If the min count is > 0,
3055                 //      continue normal processing of the body of the loop.
3056                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3057                         loopEndLoc   = URX_VAL(loopEndLoc);
3058                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3059                 if (minLoopCount == 0) {
3060                     // Min Loop Count of 0, treat like a forward branch and
3061                     //   move the current minimum length up to the target
3062                     //   (end of loop) location.
3063                     U_ASSERT(loopEndLoc <= end+1);
3064                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3065                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3066                     }
3067                 }
3068                 loc+=3;  // Skips over operands of CTR_INIT
3069             }
3070             atStart = false;
3071             break;
3072 
3073 
3074         case URX_CTR_LOOP:
3075         case URX_CTR_LOOP_NG:
3076             // Loop ops.
3077             //  The jump is conditional, backwards only.
3078             atStart = false;
3079             break;
3080 
3081         case URX_LOOP_C:
3082             // More loop ops.  These state-save to themselves.
3083             //   don't change the minimum match
3084             atStart = false;
3085             break;
3086 
3087 
3088         case URX_LA_START:
3089         case URX_LB_START:
3090             {
3091                 // Look-around.  Scan forward until the matching look-ahead end,
3092                 //   without processing the look-around block.  This is overly pessimistic.
3093 
3094                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3095                 //   lookahead contains two LA_END instructions, so count goes up by two
3096                 //   for each LA_START.
3097                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3098                 for (;;) {
3099                     loc++;
3100                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3101                     if (URX_TYPE(op) == URX_LA_START) {
3102                         depth+=2;
3103                     }
3104                     if (URX_TYPE(op) == URX_LB_START) {
3105                         depth++;
3106                     }
3107                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3108                         depth--;
3109                         if (depth == 0) {
3110                             break;
3111                         }
3112                     }
3113                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3114                         // Need this because neg lookahead blocks will FAIL to outside
3115                         //   of the block.
3116                         int32_t  jmpDest = URX_VAL(op);
3117                         if (jmpDest > loc) {
3118                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3119                                 forwardedLength.setElementAt(currentLen, jmpDest);
3120                             }
3121                         }
3122                     }
3123                     U_ASSERT(loc <= end);
3124                 }
3125             }
3126             break;
3127 
3128         case URX_LA_END:
3129         case URX_LB_CONT:
3130         case URX_LB_END:
3131         case URX_LBN_CONT:
3132         case URX_LBN_END:
3133             UPRV_UNREACHABLE_EXIT;  // Shouldn't get here.  These ops should be
3134                                     //  consumed by the scan in URX_LA_START and LB_START
3135         default:
3136             UPRV_UNREACHABLE_EXIT;
3137             }
3138 
3139         }
3140 
3141 
3142     // We have finished walking through the ops.  Check whether some forward jump
3143     //   propagated a shorter length to location end+1.
3144     if (forwardedLength.elementAti(end+1) < currentLen) {
3145         currentLen = forwardedLength.elementAti(end+1);
3146     }
3147 
3148 
3149     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3150 
3151 
3152     // Sort out what we should check for when looking for candidate match start positions.
3153     // In order of preference,
3154     //     1.   Start of input text buffer.
3155     //     2.   A literal string.
3156     //     3.   Start of line in multi-line mode.
3157     //     4.   A single literal character.
3158     //     5.   A character from a set of characters.
3159     //
3160     if (fRXPat->fStartType == START_START) {
3161         // Match only at the start of an input text string.
3162         //    start type is already set.  We're done.
3163     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3164         // Match beginning only with a literal string.
3165         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3166         U_ASSERT(fRXPat->fInitialChars->contains(c));
3167         fRXPat->fStartType   = START_STRING;
3168         fRXPat->fInitialChar = c;
3169     } else if (fRXPat->fStartType == START_LINE) {
3170         // Match at start of line in Multi-Line mode.
3171         // Nothing to do here; everything is already set.
3172     } else if (fRXPat->fMinMatchLen == 0) {
3173         // Zero length match possible.  We could start anywhere.
3174         fRXPat->fStartType = START_NO_INFO;
3175     } else if (fRXPat->fInitialChars->size() == 1) {
3176         // All matches begin with the same char.
3177         fRXPat->fStartType   = START_CHAR;
3178         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3179         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3180     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == false &&
3181         fRXPat->fMinMatchLen > 0) {
3182         // Matches start with a set of character smaller than the set of all chars.
3183         fRXPat->fStartType = START_SET;
3184     } else {
3185         // Matches can start with anything
3186         fRXPat->fStartType = START_NO_INFO;
3187     }
3188 }
3189 
3190 
3191 
3192 //------------------------------------------------------------------------------
3193 //
3194 //   minMatchLength    Calculate the length of the shortest string that could
3195 //                     match the specified pattern.
3196 //                     Length is in 16 bit code units, not code points.
3197 //
3198 //                     The calculated length may not be exact.  The returned
3199 //                     value may be shorter than the actual minimum; it must
3200 //                     never be longer.
3201 //
3202 //                     start and end are the range of p-code operations to be
3203 //                     examined.  The endpoints are included in the range.
3204 //
3205 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3206 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3207     if (U_FAILURE(*fStatus)) {
3208         return 0;
3209     }
3210 
3211     U_ASSERT(start <= end);
3212     U_ASSERT(end < fRXPat->fCompiledPat->size());
3213 
3214 
3215     int32_t    loc;
3216     int32_t    op;
3217     int32_t    opType;
3218     int32_t    currentLen = 0;
3219 
3220 
3221     // forwardedLength is a vector holding minimum-match-length values that
3222     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3223     //   It must be one longer than the pattern being checked because some  ops
3224     //   will jmp to a end-of-block+1 location from within a block, and we must
3225     //   count those when checking the block.
3226     UVector32  forwardedLength(end+2, *fStatus);
3227     forwardedLength.setSize(end+2);
3228     for (loc=start; loc<=end+1; loc++) {
3229         forwardedLength.setElementAt(INT32_MAX, loc);
3230     }
3231 
3232     for (loc = start; loc<=end; loc++) {
3233         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3234         opType = URX_TYPE(op);
3235 
3236         // The loop is advancing linearly through the pattern.
3237         // If the op we are now at was the destination of a branch in the pattern,
3238         // and that path has a shorter minimum length than the current accumulated value,
3239         // replace the current accumulated value.
3240         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3241                                                                //   no-match-possible cases.
3242         if (forwardedLength.elementAti(loc) < currentLen) {
3243             currentLen = forwardedLength.elementAti(loc);
3244             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3245         }
3246 
3247         switch (opType) {
3248             // Ops that don't change the total length matched
3249         case URX_RESERVED_OP:
3250         case URX_END:
3251         case URX_STRING_LEN:
3252         case URX_NOP:
3253         case URX_START_CAPTURE:
3254         case URX_END_CAPTURE:
3255         case URX_BACKSLASH_B:
3256         case URX_BACKSLASH_BU:
3257         case URX_BACKSLASH_G:
3258         case URX_BACKSLASH_Z:
3259         case URX_CARET:
3260         case URX_DOLLAR:
3261         case URX_DOLLAR_M:
3262         case URX_DOLLAR_D:
3263         case URX_DOLLAR_MD:
3264         case URX_RELOC_OPRND:
3265         case URX_STO_INP_LOC:
3266         case URX_CARET_M:
3267         case URX_CARET_M_UNIX:
3268         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3269         case URX_BACKREF_I:
3270 
3271         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3272         case URX_LD_SP:
3273 
3274         case URX_JMP_SAV:
3275         case URX_JMP_SAV_X:
3276             break;
3277 
3278 
3279             // Ops that match a minimum of one character (one or two 16 bit code units.)
3280             //
3281         case URX_ONECHAR:
3282         case URX_STATIC_SETREF:
3283         case URX_STAT_SETREF_N:
3284         case URX_SETREF:
3285         case URX_BACKSLASH_D:
3286         case URX_BACKSLASH_H:
3287         case URX_BACKSLASH_R:
3288         case URX_BACKSLASH_V:
3289         case URX_ONECHAR_I:
3290         case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3291         case URX_DOTANY_ALL:    // . matches one or two.
3292         case URX_DOTANY:
3293         case URX_DOTANY_UNIX:
3294             currentLen = safeIncrement(currentLen, 1);
3295             break;
3296 
3297 
3298         case URX_JMPX:
3299             loc++;              // URX_JMPX has an extra operand, ignored here,
3300                                 //   otherwise processed identically to URX_JMP.
3301             U_FALLTHROUGH;
3302         case URX_JMP:
3303             {
3304                 int32_t  jmpDest = URX_VAL(op);
3305                 if (jmpDest < loc) {
3306                     // Loop of some kind.  Can safely ignore, the worst that will happen
3307                     //  is that we understate the true minimum length
3308                     currentLen = forwardedLength.elementAti(loc+1);
3309                 } else {
3310                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3311                     U_ASSERT(jmpDest <= end+1);
3312                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3313                         forwardedLength.setElementAt(currentLen, jmpDest);
3314                     }
3315                 }
3316             }
3317             break;
3318 
3319         case URX_BACKTRACK:
3320             {
3321                 // Back-tracks are kind of like a branch, except that the min length was
3322                 //   propagated already, by the state save.
3323                 currentLen = forwardedLength.elementAti(loc+1);
3324             }
3325             break;
3326 
3327 
3328         case URX_STATE_SAVE:
3329             {
3330                 // State Save, for forward jumps, propagate the current minimum.
3331                 //             of the state save.
3332                 int32_t  jmpDest = URX_VAL(op);
3333                 if (jmpDest > loc) {
3334                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3335                         forwardedLength.setElementAt(currentLen, jmpDest);
3336                     }
3337                 }
3338             }
3339             break;
3340 
3341 
3342         case URX_STRING:
3343             {
3344                 loc++;
3345                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3346                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3347             }
3348             break;
3349 
3350 
3351         case URX_STRING_I:
3352             {
3353                 loc++;
3354                 // TODO: with full case folding, matching input text may be shorter than
3355                 //       the string we have here.  More smarts could put some bounds on it.
3356                 //       Assume a min length of one for now.  A min length of zero causes
3357                 //        optimization failures for a pattern like "string"+
3358                 // currentLen += URX_VAL(stringLenOp);
3359                 currentLen = safeIncrement(currentLen, 1);
3360             }
3361             break;
3362 
3363         case URX_CTR_INIT:
3364         case URX_CTR_INIT_NG:
3365             {
3366                 // Loop Init Ops.
3367                 //   If the min loop count == 0
3368                 //      move loc forwards to the end of the loop, skipping over the body.
3369                 //   If the min count is > 0,
3370                 //      continue normal processing of the body of the loop.
3371                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3372                         loopEndLoc   = URX_VAL(loopEndLoc);
3373                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3374                 if (minLoopCount == 0) {
3375                     loc = loopEndLoc;
3376                 } else {
3377                     loc+=3;  // Skips over operands of CTR_INIT
3378                 }
3379             }
3380             break;
3381 
3382 
3383         case URX_CTR_LOOP:
3384         case URX_CTR_LOOP_NG:
3385             // Loop ops.
3386             //  The jump is conditional, backwards only.
3387             break;
3388 
3389         case URX_LOOP_SR_I:
3390         case URX_LOOP_DOT_I:
3391         case URX_LOOP_C:
3392             // More loop ops.  These state-save to themselves.
3393             //   don't change the minimum match - could match nothing at all.
3394             break;
3395 
3396 
3397         case URX_LA_START:
3398         case URX_LB_START:
3399             {
3400                 // Look-around.  Scan forward until the matching look-ahead end,
3401                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3402                 //   it assumes that the look-ahead match might be zero-length.
3403                 //   TODO:  Positive lookahead could recursively do the block, then continue
3404                 //          with the longer of the block or the value coming in.  Ticket 6060
3405                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3406                 for (;;) {
3407                     loc++;
3408                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3409                     if (URX_TYPE(op) == URX_LA_START) {
3410                         // The boilerplate for look-ahead includes two LA_END instructions,
3411                         //    Depth will be decremented by each one when it is seen.
3412                         depth += 2;
3413                     }
3414                     if (URX_TYPE(op) == URX_LB_START) {
3415                         depth++;
3416                     }
3417                     if (URX_TYPE(op) == URX_LA_END) {
3418                         depth--;
3419                         if (depth == 0) {
3420                             break;
3421                         }
3422                     }
3423                     if (URX_TYPE(op)==URX_LBN_END) {
3424                         depth--;
3425                         if (depth == 0) {
3426                             break;
3427                         }
3428                     }
3429                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3430                         // Need this because neg lookahead blocks will FAIL to outside
3431                         //   of the block.
3432                         int32_t  jmpDest = URX_VAL(op);
3433                         if (jmpDest > loc) {
3434                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3435                                 forwardedLength.setElementAt(currentLen, jmpDest);
3436                             }
3437                         }
3438                     }
3439                     U_ASSERT(loc <= end);
3440                 }
3441             }
3442             break;
3443 
3444         case URX_LA_END:
3445         case URX_LB_CONT:
3446         case URX_LB_END:
3447         case URX_LBN_CONT:
3448         case URX_LBN_END:
3449             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3450             //   range being sized, which happens when measuring size of look-behind blocks.
3451             break;
3452 
3453         default:
3454             UPRV_UNREACHABLE_EXIT;
3455             }
3456 
3457         }
3458 
3459     // We have finished walking through the ops.  Check whether some forward jump
3460     //   propagated a shorter length to location end+1.
3461     if (forwardedLength.elementAti(end+1) < currentLen) {
3462         currentLen = forwardedLength.elementAti(end+1);
3463         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3464     }
3465 
3466     return currentLen;
3467 }
3468 
3469 //------------------------------------------------------------------------------
3470 //
3471 //   maxMatchLength    Calculate the length of the longest string that could
3472 //                     match the specified pattern.
3473 //                     Length is in 16 bit code units, not code points.
3474 //
3475 //                     The calculated length may not be exact.  The returned
3476 //                     value may be longer than the actual maximum; it must
3477 //                     never be shorter.
3478 //
3479 //                     start, end: the range of the pattern to check.
3480 //                     end is inclusive.
3481 //
3482 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3483 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3484     if (U_FAILURE(*fStatus)) {
3485         return 0;
3486     }
3487     U_ASSERT(start <= end);
3488     U_ASSERT(end < fRXPat->fCompiledPat->size());
3489 
3490     int32_t    loc;
3491     int32_t    op;
3492     int32_t    opType;
3493     int32_t    currentLen = 0;
3494     UVector32  forwardedLength(end+1, *fStatus);
3495     forwardedLength.setSize(end+1);
3496 
3497     for (loc=start; loc<=end; loc++) {
3498         forwardedLength.setElementAt(0, loc);
3499     }
3500 
3501     for (loc = start; loc<=end; loc++) {
3502         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3503         opType = URX_TYPE(op);
3504 
3505         // The loop is advancing linearly through the pattern.
3506         // If the op we are now at was the destination of a branch in the pattern,
3507         // and that path has a longer maximum length than the current accumulated value,
3508         // replace the current accumulated value.
3509         if (forwardedLength.elementAti(loc) > currentLen) {
3510             currentLen = forwardedLength.elementAti(loc);
3511         }
3512 
3513         switch (opType) {
3514             // Ops that don't change the total length matched
3515         case URX_RESERVED_OP:
3516         case URX_END:
3517         case URX_STRING_LEN:
3518         case URX_NOP:
3519         case URX_START_CAPTURE:
3520         case URX_END_CAPTURE:
3521         case URX_BACKSLASH_B:
3522         case URX_BACKSLASH_BU:
3523         case URX_BACKSLASH_G:
3524         case URX_BACKSLASH_Z:
3525         case URX_CARET:
3526         case URX_DOLLAR:
3527         case URX_DOLLAR_M:
3528         case URX_DOLLAR_D:
3529         case URX_DOLLAR_MD:
3530         case URX_RELOC_OPRND:
3531         case URX_STO_INP_LOC:
3532         case URX_CARET_M:
3533         case URX_CARET_M_UNIX:
3534 
3535         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3536         case URX_LD_SP:
3537 
3538         case URX_LB_END:
3539         case URX_LB_CONT:
3540         case URX_LBN_CONT:
3541         case URX_LBN_END:
3542             break;
3543 
3544 
3545             // Ops that increase that cause an unbounded increase in the length
3546             //   of a matched string, or that increase it a hard to characterize way.
3547             //   Call the max length unbounded, and stop further checking.
3548         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3549         case URX_BACKREF_I:
3550         case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3551             currentLen = INT32_MAX;
3552             break;
3553 
3554 
3555             // Ops that match a max of one character (possibly two 16 bit code units.)
3556             //
3557         case URX_STATIC_SETREF:
3558         case URX_STAT_SETREF_N:
3559         case URX_SETREF:
3560         case URX_BACKSLASH_D:
3561         case URX_BACKSLASH_H:
3562         case URX_BACKSLASH_R:
3563         case URX_BACKSLASH_V:
3564         case URX_ONECHAR_I:
3565         case URX_DOTANY_ALL:
3566         case URX_DOTANY:
3567         case URX_DOTANY_UNIX:
3568             currentLen = safeIncrement(currentLen, 2);
3569             break;
3570 
3571             // Single literal character.  Increase current max length by one or two,
3572             //       depending on whether the char is in the supplementary range.
3573         case URX_ONECHAR:
3574             currentLen = safeIncrement(currentLen, 1);
3575             if (URX_VAL(op) > 0x10000) {
3576                 currentLen = safeIncrement(currentLen, 1);
3577             }
3578             break;
3579 
3580             // Jumps.
3581             //
3582         case URX_JMP:
3583         case URX_JMPX:
3584         case URX_JMP_SAV:
3585         case URX_JMP_SAV_X:
3586             {
3587                 int32_t  jmpDest = URX_VAL(op);
3588                 if (jmpDest < loc) {
3589                     // Loop of some kind.  Max match length is unbounded.
3590                     currentLen = INT32_MAX;
3591                 } else {
3592                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3593                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3594                         forwardedLength.setElementAt(currentLen, jmpDest);
3595                     }
3596                     currentLen = 0;
3597                 }
3598             }
3599             break;
3600 
3601         case URX_BACKTRACK:
3602             // back-tracks are kind of like a branch, except that the max length was
3603             //   propagated already, by the state save.
3604             currentLen = forwardedLength.elementAti(loc+1);
3605             break;
3606 
3607 
3608         case URX_STATE_SAVE:
3609             {
3610                 // State Save, for forward jumps, propagate the current minimum.
3611                 //               of the state save.
3612                 //             For backwards jumps, they create a loop, maximum
3613                 //               match length is unbounded.
3614                 int32_t  jmpDest = URX_VAL(op);
3615                 if (jmpDest > loc) {
3616                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3617                         forwardedLength.setElementAt(currentLen, jmpDest);
3618                     }
3619                 } else {
3620                     currentLen = INT32_MAX;
3621                 }
3622             }
3623             break;
3624 
3625 
3626 
3627 
3628         case URX_STRING:
3629             {
3630                 loc++;
3631                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3632                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3633                 break;
3634             }
3635 
3636         case URX_STRING_I:
3637             // TODO:  This code assumes that any user string that matches will be no longer
3638             //        than our compiled string, with case insensitive matching.
3639             //        Our compiled string has been case-folded already.
3640             //
3641             //        Any matching user string will have no more code points than our
3642             //        compiled (folded) string.  Folding may add code points, but
3643             //        not remove them.
3644             //
3645             //        There is a potential problem if a supplemental code point
3646             //        case-folds to a BMP code point.  In this case our compiled string
3647             //        could be shorter (in code units) than a matching user string.
3648             //
3649             //        At this time (Unicode 6.1) there are no such characters, and this case
3650             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3651             //        any problematic characters are added to Unicode.
3652             //
3653             //        If this happens, we can make a set of the BMP chars that the
3654             //        troublesome supplementals fold to, scan our string, and bump the
3655             //        currentLen one extra for each that is found.
3656             //
3657             {
3658                 loc++;
3659                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3660                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3661             }
3662             break;
3663 
3664         case URX_CTR_INIT:
3665         case URX_CTR_INIT_NG:
3666             // For Loops, recursively call this function on the pattern for the loop body,
3667             //   then multiply the result by the maximum loop count.
3668             {
3669                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3670                 if (loopEndLoc == loc+4) {
3671                     // Loop has an empty body. No affect on max match length.
3672                     // Continue processing with code after the loop end.
3673                     loc = loopEndLoc;
3674                     break;
3675                 }
3676 
3677                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3678                 if (maxLoopCount == -1) {
3679                     // Unbounded Loop. No upper bound on match length.
3680                     currentLen = INT32_MAX;
3681                     break;
3682                 }
3683 
3684                 U_ASSERT(loopEndLoc >= loc+4);
3685                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3686                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3687                 if (updatedLen >= INT32_MAX) {
3688                     currentLen = INT32_MAX;
3689                     break;
3690                 }
3691                 currentLen = (int32_t)updatedLen;
3692                 loc = loopEndLoc;
3693                 break;
3694             }
3695 
3696         case URX_CTR_LOOP:
3697         case URX_CTR_LOOP_NG:
3698             // These opcodes will be skipped over by code for URX_CTR_INIT.
3699             // We shouldn't encounter them here.
3700             UPRV_UNREACHABLE_EXIT;
3701 
3702         case URX_LOOP_SR_I:
3703         case URX_LOOP_DOT_I:
3704         case URX_LOOP_C:
3705             // For anything to do with loops, make the match length unbounded.
3706             currentLen = INT32_MAX;
3707             break;
3708 
3709 
3710 
3711         case URX_LA_START:
3712         case URX_LA_END:
3713             // Look-ahead.  Just ignore, treat the look-ahead block as if
3714             // it were normal pattern.  Gives a too-long match length,
3715             //  but good enough for now.
3716             break;
3717 
3718             // End of look-ahead ops should always be consumed by the processing at
3719             //  the URX_LA_START op.
3720             // UPRV_UNREACHABLE_EXIT;
3721 
3722         case URX_LB_START:
3723             {
3724                 // Look-behind.  Scan forward until the matching look-around end,
3725                 //   without processing the look-behind block.
3726                 int32_t dataLoc = URX_VAL(op);
3727                 for (loc = loc + 1; loc <= end; ++loc) {
3728                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3729                     int32_t opType = URX_TYPE(op);
3730                     if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3731                         break;
3732                     }
3733                 }
3734                 U_ASSERT(loc <= end);
3735             }
3736             break;
3737 
3738         default:
3739             UPRV_UNREACHABLE_EXIT;
3740         }
3741 
3742 
3743         if (currentLen == INT32_MAX) {
3744             //  The maximum length is unbounded.
3745             //  Stop further processing of the pattern.
3746             break;
3747         }
3748 
3749     }
3750     return currentLen;
3751 
3752 }
3753 
3754 
3755 //------------------------------------------------------------------------------
3756 //
3757 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3758 //                Extra NOPs are inserted for some constructs during the initial
3759 //                code generation to provide locations that may be patched later.
3760 //                Many end up unneeded, and are removed by this function.
3761 //
3762 //                In order to minimize the number of passes through the pattern,
3763 //                back-reference fixup is also performed here (adjusting
3764 //                back-reference operands to point to the correct frame offsets).
3765 //
3766 //------------------------------------------------------------------------------
stripNOPs()3767 void RegexCompile::stripNOPs() {
3768 
3769     if (U_FAILURE(*fStatus)) {
3770         return;
3771     }
3772 
3773     int32_t    end = fRXPat->fCompiledPat->size();
3774     UVector32  deltas(end, *fStatus);
3775 
3776     // Make a first pass over the code, computing the amount that things
3777     //   will be offset at each location in the original code.
3778     int32_t   loc;
3779     int32_t   d = 0;
3780     for (loc=0; loc<end; loc++) {
3781         deltas.addElement(d, *fStatus);
3782         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3783         if (URX_TYPE(op) == URX_NOP) {
3784             d++;
3785         }
3786     }
3787 
3788     UnicodeString caseStringBuffer;
3789 
3790     // Make a second pass over the code, removing the NOPs by moving following
3791     //  code up, and patching operands that refer to code locations that
3792     //  are being moved.  The array of offsets from the first step is used
3793     //  to compute the new operand values.
3794     int32_t src;
3795     int32_t dst = 0;
3796     for (src=0; src<end; src++) {
3797         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3798         int32_t opType = URX_TYPE(op);
3799         switch (opType) {
3800         case URX_NOP:
3801             break;
3802 
3803         case URX_STATE_SAVE:
3804         case URX_JMP:
3805         case URX_CTR_LOOP:
3806         case URX_CTR_LOOP_NG:
3807         case URX_RELOC_OPRND:
3808         case URX_JMPX:
3809         case URX_JMP_SAV:
3810         case URX_JMP_SAV_X:
3811             // These are instructions with operands that refer to code locations.
3812             {
3813                 int32_t  operandAddress = URX_VAL(op);
3814                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3815                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3816                 op = buildOp(opType, fixedOperandAddress);
3817                 fRXPat->fCompiledPat->setElementAt(op, dst);
3818                 dst++;
3819                 break;
3820             }
3821 
3822         case URX_BACKREF:
3823         case URX_BACKREF_I:
3824             {
3825                 int32_t where = URX_VAL(op);
3826                 if (where > fRXPat->fGroupMap->size()) {
3827                     error(U_REGEX_INVALID_BACK_REF);
3828                     break;
3829                 }
3830                 where = fRXPat->fGroupMap->elementAti(where-1);
3831                 op    = buildOp(opType, where);
3832                 fRXPat->fCompiledPat->setElementAt(op, dst);
3833                 dst++;
3834 
3835                 fRXPat->fNeedsAltInput = true;
3836                 break;
3837             }
3838         case URX_RESERVED_OP:
3839         case URX_RESERVED_OP_N:
3840         case URX_BACKTRACK:
3841         case URX_END:
3842         case URX_ONECHAR:
3843         case URX_STRING:
3844         case URX_STRING_LEN:
3845         case URX_START_CAPTURE:
3846         case URX_END_CAPTURE:
3847         case URX_STATIC_SETREF:
3848         case URX_STAT_SETREF_N:
3849         case URX_SETREF:
3850         case URX_DOTANY:
3851         case URX_FAIL:
3852         case URX_BACKSLASH_B:
3853         case URX_BACKSLASH_BU:
3854         case URX_BACKSLASH_G:
3855         case URX_BACKSLASH_X:
3856         case URX_BACKSLASH_Z:
3857         case URX_DOTANY_ALL:
3858         case URX_BACKSLASH_D:
3859         case URX_CARET:
3860         case URX_DOLLAR:
3861         case URX_CTR_INIT:
3862         case URX_CTR_INIT_NG:
3863         case URX_DOTANY_UNIX:
3864         case URX_STO_SP:
3865         case URX_LD_SP:
3866         case URX_STO_INP_LOC:
3867         case URX_LA_START:
3868         case URX_LA_END:
3869         case URX_ONECHAR_I:
3870         case URX_STRING_I:
3871         case URX_DOLLAR_M:
3872         case URX_CARET_M:
3873         case URX_CARET_M_UNIX:
3874         case URX_LB_START:
3875         case URX_LB_CONT:
3876         case URX_LB_END:
3877         case URX_LBN_CONT:
3878         case URX_LBN_END:
3879         case URX_LOOP_SR_I:
3880         case URX_LOOP_DOT_I:
3881         case URX_LOOP_C:
3882         case URX_DOLLAR_D:
3883         case URX_DOLLAR_MD:
3884         case URX_BACKSLASH_H:
3885         case URX_BACKSLASH_R:
3886         case URX_BACKSLASH_V:
3887             // These instructions are unaltered by the relocation.
3888             fRXPat->fCompiledPat->setElementAt(op, dst);
3889             dst++;
3890             break;
3891 
3892         default:
3893             // Some op is unaccounted for.
3894             UPRV_UNREACHABLE_EXIT;
3895         }
3896     }
3897 
3898     fRXPat->fCompiledPat->setSize(dst);
3899 }
3900 
3901 
3902 
3903 
3904 //------------------------------------------------------------------------------
3905 //
3906 //  Error         Report a rule parse error.
3907 //                Only report it if no previous error has been recorded.
3908 //
3909 //------------------------------------------------------------------------------
error(UErrorCode e)3910 void RegexCompile::error(UErrorCode e) {
3911     if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3912         *fStatus = e;
3913         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3914         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3915         // int64_t. If the values of the latter are out of range for the former,
3916         // set them to the appropriate "field not supported" values.
3917         if (fLineNum > 0x7FFFFFFF) {
3918             fParseErr->line   = 0;
3919             fParseErr->offset = -1;
3920         } else if (fCharNum > 0x7FFFFFFF) {
3921             fParseErr->line   = (int32_t)fLineNum;
3922             fParseErr->offset = -1;
3923         } else {
3924             fParseErr->line   = (int32_t)fLineNum;
3925             fParseErr->offset = (int32_t)fCharNum;
3926         }
3927 
3928         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3929 
3930         // Fill in the context.
3931         //   Note: extractBetween() pins supplied indices to the string bounds.
3932         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3933         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3934         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3935         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3936     }
3937 }
3938 
3939 
3940 //
3941 //  Assorted Unicode character constants.
3942 //     Numeric because there is no portable way to enter them as literals.
3943 //     (Think EBCDIC).
3944 //
3945 static const char16_t   chCR        = 0x0d;      // New lines, for terminating comments.
3946 static const char16_t   chLF        = 0x0a;      // Line Feed
3947 static const char16_t   chPound     = 0x23;      // '#', introduces a comment.
3948 static const char16_t   chDigit0    = 0x30;      // '0'
3949 static const char16_t   chDigit7    = 0x37;      // '9'
3950 static const char16_t   chColon     = 0x3A;      // ':'
3951 static const char16_t   chE         = 0x45;      // 'E'
3952 static const char16_t   chQ         = 0x51;      // 'Q'
3953 //static const char16_t   chN         = 0x4E;      // 'N'
3954 static const char16_t   chP         = 0x50;      // 'P'
3955 static const char16_t   chBackSlash = 0x5c;      // '\'  introduces a char escape
3956 //static const char16_t   chLBracket  = 0x5b;      // '['
3957 static const char16_t   chRBracket  = 0x5d;      // ']'
3958 static const char16_t   chUp        = 0x5e;      // '^'
3959 static const char16_t   chLowerP    = 0x70;
3960 static const char16_t   chLBrace    = 0x7b;      // '{'
3961 static const char16_t   chRBrace    = 0x7d;      // '}'
3962 static const char16_t   chNEL       = 0x85;      //    NEL newline variant
3963 static const char16_t   chLS        = 0x2028;    //    Unicode Line Separator
3964 
3965 
3966 //------------------------------------------------------------------------------
3967 //
3968 //  nextCharLL    Low Level Next Char from the regex pattern.
3969 //                Get a char from the string, keep track of input position
3970 //                     for error reporting.
3971 //
3972 //------------------------------------------------------------------------------
nextCharLL()3973 UChar32  RegexCompile::nextCharLL() {
3974     UChar32       ch;
3975 
3976     if (fPeekChar != -1) {
3977         ch = fPeekChar;
3978         fPeekChar = -1;
3979         return ch;
3980     }
3981 
3982     // assume we're already in the right place
3983     ch = UTEXT_NEXT32(fRXPat->fPattern);
3984     if (ch == U_SENTINEL) {
3985         return ch;
3986     }
3987 
3988     if (ch == chCR ||
3989         ch == chNEL ||
3990         ch == chLS   ||
3991         (ch == chLF && fLastChar != chCR)) {
3992         // Character is starting a new line.  Bump up the line number, and
3993         //  reset the column to 0.
3994         fLineNum++;
3995         fCharNum=0;
3996     }
3997     else {
3998         // Character is not starting a new line.  Except in the case of a
3999         //   LF following a CR, increment the column position.
4000         if (ch != chLF) {
4001             fCharNum++;
4002         }
4003     }
4004     fLastChar = ch;
4005     return ch;
4006 }
4007 
4008 //------------------------------------------------------------------------------
4009 //
4010 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
4011 //                 character without actually getting it.
4012 //
4013 //------------------------------------------------------------------------------
peekCharLL()4014 UChar32  RegexCompile::peekCharLL() {
4015     if (fPeekChar == -1) {
4016         fPeekChar = nextCharLL();
4017     }
4018     return fPeekChar;
4019 }
4020 
4021 
4022 //------------------------------------------------------------------------------
4023 //
4024 //   nextChar     for pattern scanning.  At this level, we handle stripping
4025 //                out comments and processing some backslash character escapes.
4026 //                The rest of the pattern grammar is handled at the next level up.
4027 //
4028 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4029 void RegexCompile::nextChar(RegexPatternChar &c) {
4030   tailRecursion:
4031     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4032     c.fChar    = nextCharLL();
4033     c.fQuoted  = false;
4034 
4035     if (fQuoteMode) {
4036         c.fQuoted = true;
4037         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4038             c.fChar == (UChar32)-1) {
4039             fQuoteMode = false;  //  Exit quote mode,
4040             nextCharLL();        // discard the E
4041             // nextChar(c);      // recurse to get the real next char
4042             goto tailRecursion;  // Note: fuzz testing produced testcases that
4043                                  //       resulted in stack overflow here.
4044         }
4045     }
4046     else if (fInBackslashQuote) {
4047         // The current character immediately follows a '\'
4048         // Don't check for any further escapes, just return it as-is.
4049         // Don't set c.fQuoted, because that would prevent the state machine from
4050         //    dispatching on the character.
4051         fInBackslashQuote = false;
4052     }
4053     else
4054     {
4055         // We are not in a \Q quoted region \E of the source.
4056         //
4057         if (fModeFlags & UREGEX_COMMENTS) {
4058             //
4059             // We are in free-spacing and comments mode.
4060             //  Scan through any white space and comments, until we
4061             //  reach a significant character or the end of input.
4062             for (;;) {
4063                 if (c.fChar == (UChar32)-1) {
4064                     break;     // End of Input
4065                 }
4066                 if  (c.fChar == chPound && fEOLComments) {
4067                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4068                     for (;;) {
4069                         c.fChar = nextCharLL();
4070                         if (c.fChar == (UChar32)-1 ||  // EOF
4071                             c.fChar == chCR        ||
4072                             c.fChar == chLF        ||
4073                             c.fChar == chNEL       ||
4074                             c.fChar == chLS)       {
4075                             break;
4076                         }
4077                     }
4078                 }
4079                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4080                 if (PatternProps::isWhiteSpace(c.fChar) == false) {
4081                     break;
4082                 }
4083                 c.fChar = nextCharLL();
4084             }
4085         }
4086 
4087         //
4088         //  check for backslash escaped characters.
4089         //
4090         if (c.fChar == chBackSlash) {
4091             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4092             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4093                 //
4094                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4095                 //   Includes \uxxxx, \n, \r, many others.
4096                 //   Return the single equivalent character.
4097                 //
4098                 nextCharLL();                 // get & discard the peeked char.
4099                 c.fQuoted = true;
4100 
4101                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4102                     int32_t endIndex = (int32_t)pos;
4103                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4104 
4105                     if (endIndex == pos) {
4106                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4107                     }
4108                     fCharNum += endIndex - pos;
4109                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4110                 } else {
4111                     int32_t offset = 0;
4112                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4113 
4114                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4115                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4116 
4117                     if (offset == 0) {
4118                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4119                     } else if (context.lastOffset == offset) {
4120                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4121                     } else if (context.lastOffset != offset-1) {
4122                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4123                     }
4124                     fCharNum += offset;
4125                 }
4126             }
4127             else if (peekCharLL() == chDigit0) {
4128                 //  Octal Escape, using Java Regexp Conventions
4129                 //    which are \0 followed by 1-3 octal digits.
4130                 //    Different from ICU Unescape handling of Octal, which does not
4131                 //    require the leading 0.
4132                 //  Java also has the convention of only consuming 2 octal digits if
4133                 //    the three digit number would be > 0xff
4134                 //
4135                 c.fChar = 0;
4136                 nextCharLL();    // Consume the initial 0.
4137                 int index;
4138                 for (index=0; index<3; index++) {
4139                     int32_t ch = peekCharLL();
4140                     if (ch<chDigit0 || ch>chDigit7) {
4141                         if (index==0) {
4142                            // \0 is not followed by any octal digits.
4143                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4144                         }
4145                         break;
4146                     }
4147                     c.fChar <<= 3;
4148                     c.fChar += ch&7;
4149                     if (c.fChar <= 255) {
4150                         nextCharLL();
4151                     } else {
4152                         // The last digit made the number too big.  Forget we saw it.
4153                         c.fChar >>= 3;
4154                     }
4155                 }
4156                 c.fQuoted = true;
4157             }
4158             else if (peekCharLL() == chQ) {
4159                 //  "\Q"  enter quote mode, which will continue until "\E"
4160                 fQuoteMode = true;
4161                 nextCharLL();        // discard the 'Q'.
4162                 // nextChar(c);      // recurse to get the real next char.
4163                 goto tailRecursion;  // Note: fuzz testing produced test cases that
4164                 //                            resulted in stack overflow here.
4165             }
4166             else
4167             {
4168                 // We are in a '\' escape that will be handled by the state table scanner.
4169                 // Just return the backslash, but remember that the following char is to
4170                 //  be taken literally.
4171                 fInBackslashQuote = true;
4172             }
4173         }
4174     }
4175 
4176     // re-enable # to end-of-line comments, in case they were disabled.
4177     // They are disabled by the parser upon seeing '(?', but this lasts for
4178     //  the fetching of the next character only.
4179     fEOLComments = true;
4180 
4181     // putc(c.fChar, stdout);
4182 }
4183 
4184 
4185 
4186 //------------------------------------------------------------------------------
4187 //
4188 //  scanNamedChar
4189 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4190 //
4191 //             The scan position will be at the 'N'.  On return
4192 //             the scan position should be just after the '}'
4193 //
4194 //             Return the UChar32
4195 //
4196 //------------------------------------------------------------------------------
scanNamedChar()4197 UChar32  RegexCompile::scanNamedChar() {
4198     if (U_FAILURE(*fStatus)) {
4199         return 0;
4200     }
4201 
4202     nextChar(fC);
4203     if (fC.fChar != chLBrace) {
4204         error(U_REGEX_PROPERTY_SYNTAX);
4205         return 0;
4206     }
4207 
4208     UnicodeString  charName;
4209     for (;;) {
4210         nextChar(fC);
4211         if (fC.fChar == chRBrace) {
4212             break;
4213         }
4214         if (fC.fChar == -1) {
4215             error(U_REGEX_PROPERTY_SYNTAX);
4216             return 0;
4217         }
4218         charName.append(fC.fChar);
4219     }
4220 
4221     char name[100];
4222     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4223          (uint32_t)charName.length()>=sizeof(name)) {
4224         // All Unicode character names have only invariant characters.
4225         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4226         //   which requires this error check
4227         error(U_REGEX_PROPERTY_SYNTAX);
4228         return 0;
4229     }
4230     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4231 
4232     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4233     if (U_FAILURE(*fStatus)) {
4234         error(U_REGEX_PROPERTY_SYNTAX);
4235     }
4236 
4237     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4238     return theChar;
4239 }
4240 
4241 //------------------------------------------------------------------------------
4242 //
4243 //  scanProp   Construct a UnicodeSet from the text at the current scan
4244 //             position, which will be of the form \p{whaterver}
4245 //
4246 //             The scan position will be at the 'p' or 'P'.  On return
4247 //             the scan position should be just after the '}'
4248 //
4249 //             Return a UnicodeSet, constructed from the \P pattern,
4250 //             or nullptr if the pattern is invalid.
4251 //
4252 //------------------------------------------------------------------------------
scanProp()4253 UnicodeSet *RegexCompile::scanProp() {
4254     UnicodeSet    *uset = nullptr;
4255 
4256     if (U_FAILURE(*fStatus)) {
4257         return nullptr;
4258     }
4259     (void)chLowerP;   // Suppress compiler unused variable warning.
4260     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4261     UBool negated = (fC.fChar == chP);
4262 
4263     UnicodeString propertyName;
4264     nextChar(fC);
4265     if (fC.fChar != chLBrace) {
4266         error(U_REGEX_PROPERTY_SYNTAX);
4267         return nullptr;
4268     }
4269     for (;;) {
4270         nextChar(fC);
4271         if (fC.fChar == chRBrace) {
4272             break;
4273         }
4274         if (fC.fChar == -1) {
4275             // Hit the end of the input string without finding the closing '}'
4276             error(U_REGEX_PROPERTY_SYNTAX);
4277             return nullptr;
4278         }
4279         propertyName.append(fC.fChar);
4280     }
4281     uset = createSetForProperty(propertyName, negated);
4282     nextChar(fC);    // Move input scan to position following the closing '}'
4283     return uset;
4284 }
4285 
4286 //------------------------------------------------------------------------------
4287 //
4288 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4289 //             position, which is expected be of the form [:property expression:]
4290 //
4291 //             The scan position will be at the opening ':'.  On return
4292 //             the scan position must be on the closing ']'
4293 //
4294 //             Return a UnicodeSet constructed from the pattern,
4295 //             or nullptr if this is not a valid POSIX-style set expression.
4296 //             If not a property expression, restore the initial scan position
4297 //                (to the opening ':')
4298 //
4299 //               Note:  the opening '[:' is not sufficient to guarantee that
4300 //                      this is a [:property:] expression.
4301 //                      [:'+=,] is a perfectly good ordinary set expression that
4302 //                              happens to include ':' as one of its characters.
4303 //
4304 //------------------------------------------------------------------------------
scanPosixProp()4305 UnicodeSet *RegexCompile::scanPosixProp() {
4306     UnicodeSet    *uset = nullptr;
4307 
4308     if (U_FAILURE(*fStatus)) {
4309         return nullptr;
4310     }
4311 
4312     U_ASSERT(fC.fChar == chColon);
4313 
4314     // Save the scanner state.
4315     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4316     int64_t     savedScanIndex        = fScanIndex;
4317     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4318     UBool       savedQuoteMode        = fQuoteMode;
4319     UBool       savedInBackslashQuote = fInBackslashQuote;
4320     UBool       savedEOLComments      = fEOLComments;
4321     int64_t     savedLineNum          = fLineNum;
4322     int64_t     savedCharNum          = fCharNum;
4323     UChar32     savedLastChar         = fLastChar;
4324     UChar32     savedPeekChar         = fPeekChar;
4325     RegexPatternChar savedfC          = fC;
4326 
4327     // Scan for a closing ].   A little tricky because there are some perverse
4328     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4329     //   ending on the second closing ].
4330 
4331     UnicodeString propName;
4332     UBool         negated  = false;
4333 
4334     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4335     nextChar(fC);
4336     if (fC.fChar == chUp) {
4337        negated = true;
4338        nextChar(fC);
4339     }
4340 
4341     // Scan for the closing ":]", collecting the property name along the way.
4342     UBool  sawPropSetTerminator = false;
4343     for (;;) {
4344         propName.append(fC.fChar);
4345         nextChar(fC);
4346         if (fC.fQuoted || fC.fChar == -1) {
4347             // Escaped characters or end of input - either says this isn't a [:Property:]
4348             break;
4349         }
4350         if (fC.fChar == chColon) {
4351             nextChar(fC);
4352             if (fC.fChar == chRBracket) {
4353                 sawPropSetTerminator = true;
4354             }
4355             break;
4356         }
4357     }
4358 
4359     if (sawPropSetTerminator) {
4360         uset = createSetForProperty(propName, negated);
4361     }
4362     else
4363     {
4364         // No closing ":]".
4365         //  Restore the original scan position.
4366         //  The main scanner will retry the input as a normal set expression,
4367         //    not a [:Property:] expression.
4368         fScanIndex        = savedScanIndex;
4369         fQuoteMode        = savedQuoteMode;
4370         fInBackslashQuote = savedInBackslashQuote;
4371         fEOLComments      = savedEOLComments;
4372         fLineNum          = savedLineNum;
4373         fCharNum          = savedCharNum;
4374         fLastChar         = savedLastChar;
4375         fPeekChar         = savedPeekChar;
4376         fC                = savedfC;
4377         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4378     }
4379     return uset;
4380 }
4381 
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4382 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4383     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4384     addCategory(set, U_GC_CF_MASK, ec);
4385 }
4386 
4387 //
4388 //  Create a Unicode Set from a Unicode Property expression.
4389 //     This is common code underlying both \p{...} and [:...:] expressions.
4390 //     Includes trying the Java "properties" that aren't supported as
4391 //     normal ICU UnicodeSet properties
4392 //
createSetForProperty(const UnicodeString & propName,UBool negated)4393 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4394 
4395     if (U_FAILURE(*fStatus)) {
4396         return nullptr;
4397     }
4398     LocalPointer<UnicodeSet> set;
4399     UErrorCode status = U_ZERO_ERROR;
4400 
4401     do {      // non-loop, exists to allow breaks from the block.
4402         //
4403         //  First try the property as we received it
4404         //
4405         UnicodeString   setExpr;
4406         uint32_t        usetFlags = 0;
4407         setExpr.append(u"[\\p{", -1);
4408         setExpr.append(propName);
4409         setExpr.append(u"}]", -1);
4410         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4411             usetFlags |= USET_CASE_INSENSITIVE;
4412         }
4413         set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, nullptr, status), status);
4414         if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4415             break;
4416         }
4417 
4418         //
4419         //  The incoming property wasn't directly recognized by ICU.
4420 
4421         //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4422         //     Java accepts 'word' with mixed case.
4423         //     Java accepts 'all' only in all lower case.
4424 
4425         status = U_ZERO_ERROR;
4426         if (propName.caseCompare(u"word", -1, 0) == 0) {
4427             set.adoptInsteadAndCheckErrorCode(
4428                 RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4429             break;
4430         }
4431         if (propName.compare(u"all", -1) == 0) {
4432             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4433             break;
4434         }
4435 
4436 
4437         //    Do Java InBlock expressions
4438         //
4439         UnicodeString mPropName = propName;
4440         if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4441             status = U_ZERO_ERROR;
4442             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4443             if (U_FAILURE(status)) {
4444                 break;
4445             }
4446             UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4447             set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4448             break;
4449         }
4450 
4451         //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4452         //  as "BooleanPropertyValue". The property value can be either a
4453         //  a General Category or a Script Name.
4454 
4455         if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4456             mPropName.remove(0, 2);      // Strip the "Is"
4457             if (mPropName.indexOf(u'=') >= 0) {
4458                 // Reject any "Is..." property expression containing an '=', that is,
4459                 // any non-binary property expression.
4460                 status = U_REGEX_PROPERTY_SYNTAX;
4461                 break;
4462             }
4463 
4464             if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4465                 mPropName.setTo(u"unassigned", -1);
4466                 negated = !negated;
4467             } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4468                 mPropName.setTo(u"Titlecase_Letter", -1);
4469             }
4470 
4471             mPropName.insert(0, u"[\\p{", -1);
4472             mPropName.append(u"}]", -1);
4473             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4474 
4475             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4476                 set->closeOver(USET_CASE_INSENSITIVE);
4477             }
4478             break;
4479 
4480         }
4481 
4482         if (propName.startsWith(u"java", -1)) {
4483             status = U_ZERO_ERROR;
4484             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4485             if (U_FAILURE(status)) {
4486                 break;
4487             }
4488             //
4489             //  Try the various Java specific properties.
4490             //   These all begin with "java"
4491             //
4492             if (propName.compare(u"javaDefined", -1) == 0) {
4493                 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4494                 set->complement();
4495             }
4496             else if (propName.compare(u"javaDigit", -1) == 0) {
4497                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4498             }
4499             else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4500                 addIdentifierIgnorable(set.getAlias(), status);
4501             }
4502             else if (propName.compare(u"javaISOControl", -1) == 0) {
4503                 set->add(0, 0x1F).add(0x7F, 0x9F);
4504             }
4505             else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4506                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4507                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4508                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4509                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4510                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4511                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4512                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4513                 addIdentifierIgnorable(set.getAlias(), status);
4514             }
4515             else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4516                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4517                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4518                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4519                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4520             }
4521             else if (propName.compare(u"javaLetter", -1) == 0) {
4522                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4523             }
4524             else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4525                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4526                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4527             }
4528             else if (propName.compare(u"javaLowerCase", -1) == 0) {
4529                 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4530             }
4531             else if (propName.compare(u"javaMirrored", -1) == 0) {
4532                 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4533             }
4534             else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4535                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4536             }
4537             else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4538                 set->add(0x10000, UnicodeSet::MAX_VALUE);
4539             }
4540             else if (propName.compare(u"javaTitleCase", -1) == 0) {
4541                 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4542             }
4543             else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4544                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4545                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4546             }
4547             else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4548                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4549                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4550                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4551                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4552                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4553                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4554                 addIdentifierIgnorable(set.getAlias(), status);
4555             }
4556             else if (propName.compare(u"javaUpperCase", -1) == 0) {
4557                 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4558             }
4559             else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4560                 set->add(0, UnicodeSet::MAX_VALUE);
4561             }
4562             else if (propName.compare(u"javaWhitespace", -1) == 0) {
4563                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4564                 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4565                 set->add(9, 0x0d).add(0x1c, 0x1f);
4566             } else {
4567                 status = U_REGEX_PROPERTY_SYNTAX;
4568             }
4569 
4570             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4571                 set->closeOver(USET_CASE_INSENSITIVE);
4572             }
4573             break;
4574         }
4575 
4576         // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4577         // extensions matched it.
4578         status = U_REGEX_PROPERTY_SYNTAX;
4579     } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4580 
4581     if (U_SUCCESS(status)) {
4582         // ICU 70 adds emoji properties of strings, but as long as Java does not say how to
4583         // deal with properties of strings and character classes with strings, we ignore them.
4584         // Just in case something downstream might stumble over the strings,
4585         // we remove them from the set.
4586         // Note that when we support strings, the complement of a property (as with \P)
4587         // should be implemented as .complement().removeAllStrings() (code point complement).
4588         set->removeAllStrings();
4589         U_ASSERT(set.isValid());
4590         if (negated) {
4591             set->complement();
4592         }
4593         return set.orphan();
4594     } else {
4595         if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4596             status = U_REGEX_PROPERTY_SYNTAX;
4597         }
4598         error(status);
4599         return nullptr;
4600     }
4601 }
4602 
4603 
4604 //
4605 //  SetEval   Part of the evaluation of [set expressions].
4606 //            Perform any pending (stacked) operations with precedence
4607 //            equal or greater to that of the next operator encountered
4608 //            in the expression.
4609 //
setEval(int32_t nextOp)4610 void RegexCompile::setEval(int32_t nextOp) {
4611     UnicodeSet *rightOperand = nullptr;
4612     UnicodeSet *leftOperand  = nullptr;
4613     for (;;) {
4614         U_ASSERT(fSetOpStack.empty()==false);
4615         int32_t pendingSetOperation = fSetOpStack.peeki();
4616         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4617             break;
4618         }
4619         fSetOpStack.popi();
4620         U_ASSERT(fSetStack.empty() == false);
4621         rightOperand = (UnicodeSet *)fSetStack.peek();
4622         // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
4623         // (see comments there).
4624         // We also do not yet support string literals in character classes,
4625         // so there should not be any strings.
4626         // Note that when we support strings, the complement of a set (as with ^ or \P)
4627         // should be implemented as .complement().removeAllStrings() (code point complement).
4628         U_ASSERT(!rightOperand->hasStrings());
4629         switch (pendingSetOperation) {
4630             case setNegation:
4631                 rightOperand->complement();
4632                 break;
4633             case setCaseClose:
4634                 // TODO: need a simple close function.  Ticket 6065
4635                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4636                 rightOperand->removeAllStrings();
4637                 break;
4638             case setDifference1:
4639             case setDifference2:
4640                 fSetStack.pop();
4641                 leftOperand = (UnicodeSet *)fSetStack.peek();
4642                 leftOperand->removeAll(*rightOperand);
4643                 delete rightOperand;
4644                 break;
4645             case setIntersection1:
4646             case setIntersection2:
4647                 fSetStack.pop();
4648                 leftOperand = (UnicodeSet *)fSetStack.peek();
4649                 leftOperand->retainAll(*rightOperand);
4650                 delete rightOperand;
4651                 break;
4652             case setUnion:
4653                 fSetStack.pop();
4654                 leftOperand = (UnicodeSet *)fSetStack.peek();
4655                 leftOperand->addAll(*rightOperand);
4656                 delete rightOperand;
4657                 break;
4658             default:
4659                 UPRV_UNREACHABLE_EXIT;
4660             }
4661         }
4662     }
4663 
setPushOp(int32_t op)4664 void RegexCompile::setPushOp(int32_t op) {
4665     setEval(op);
4666     fSetOpStack.push(op, *fStatus);
4667     LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
4668     fSetStack.push(lpSet.orphan(), *fStatus);
4669 }
4670 
4671 U_NAMESPACE_END
4672 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4673 
4674