1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // regexcmp.h 5 // 6 // Copyright (C) 2002-2016, International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains declarations for the class RegexCompile 10 // 11 // This class is internal to the regular expression implementation. 12 // For the public Regular Expression API, see the file "unicode/regex.h" 13 // 14 15 16 #ifndef RBBISCAN_H 17 #define RBBISCAN_H 18 19 #include "unicode/utypes.h" 20 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 21 22 #include "unicode/parseerr.h" 23 #include "unicode/uniset.h" 24 #include "unicode/uobject.h" 25 #include "unicode/utext.h" 26 #include "uhash.h" 27 #include "uvector.h" 28 #include "uvectr32.h" 29 30 31 32 U_NAMESPACE_BEGIN 33 34 35 //-------------------------------------------------------------------------------- 36 // 37 // class RegexCompile Contains the regular expression compiler. 38 // 39 //-------------------------------------------------------------------------------- 40 struct RegexTableEl; 41 class RegexPattern; 42 43 44 class U_I18N_API RegexCompile : public UMemory { 45 public: 46 47 enum { 48 kStackSize = 100 // The size of the state stack for 49 }; // pattern parsing. Corresponds roughly 50 // to the depth of parentheses nesting 51 // that is allowed in the rules. 52 53 struct RegexPatternChar { 54 UChar32 fChar; 55 UBool fQuoted; 56 }; 57 58 RegexCompile(RegexPattern *rp, UErrorCode &e); 59 60 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 61 void compile(UText *pat, UParseError &pp, UErrorCode &e); 62 63 64 virtual ~RegexCompile(); 65 66 void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 67 68 static void cleanup(); // Memory cleanup 69 70 71 72 // Categories of parentheses in pattern. 73 // The category is saved in the compile-time parentheses stack frame, and 74 // determines the code to be generated when the matching close ) is encountered. 75 enum EParenClass { 76 plain = -1, // No special handling 77 capturing = -2, 78 atomic = -3, 79 lookAhead = -4, 80 negLookAhead = -5, 81 flags = -6, 82 lookBehind = -7, 83 lookBehindN = -8 84 }; 85 86 private: 87 88 89 UBool doParseActions(int32_t a); 90 void error(UErrorCode e); // error reporting convenience function. 91 92 UChar32 nextCharLL(); 93 UChar32 peekCharLL(); 94 UnicodeSet *scanProp(); 95 UnicodeSet *scanPosixProp(); 96 void handleCloseParen(); 97 int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 98 // at the top of the just completed block 99 // or operation, and optionally ensure that 100 // there is space to add an opcode there. 101 void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 102 // a reference to a UnicodeSet. 103 void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 104 int32_t LoopOp); 105 UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 106 void literalChar(UChar32 c); // Compile a literal char 107 void fixLiterals(UBool split=false); // Generate code for pending literal characters. 108 void insertOp(int32_t where); // Open up a slot for a new op in the 109 // generated code at the specified location. 110 void appendOp(int32_t op); // Append a new op to the compiled pattern. 111 void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern. 112 int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode instruction. 113 int32_t allocateData(int32_t size); // Allocate space in the matcher data area. 114 // Return index of the newly allocated data. 115 int32_t allocateStackData(int32_t size); // Allocate space in the match back-track stack frame. 116 // Return offset index in the frame. 117 int32_t minMatchLength(int32_t start, 118 int32_t end); 119 int32_t maxMatchLength(int32_t start, 120 int32_t end); 121 void matchStartType(); 122 void stripNOPs(); 123 124 void setEval(int32_t op); 125 void setPushOp(int32_t op); 126 UChar32 scanNamedChar(); 127 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); 128 129 public: // Public for testing only. 130 static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars); 131 private: 132 133 134 UErrorCode *fStatus; 135 RegexPattern *fRXPat; 136 UParseError *fParseErr; 137 138 // 139 // Data associated with low level character scanning 140 // 141 int64_t fScanIndex; // Index of current character being processed 142 // in the rule input string. 143 UBool fQuoteMode; // Scan is in a \Q...\E quoted region 144 UBool fInBackslashQuote; // Scan is between a '\' and the following char. 145 UBool fEOLComments; // When scan is just after '(?', inhibit #... to 146 // end of line comments, in favor of (?#...) comments. 147 int64_t fLineNum; // Line number in input file. 148 int64_t fCharNum; // Char position within the line. 149 UChar32 fLastChar; // Previous char, needed to count CR-LF 150 // as a single line, not two. 151 UChar32 fPeekChar; // Saved char, if we've scanned ahead. 152 153 154 RegexPatternChar fC; // Current char for parse state machine 155 // processing. 156 157 // 158 // Data for the state machine that parses the regular expression. 159 // 160 RegexTableEl **fStateTable; // State Transition Table for regex Rule 161 // parsing. index by p[state][char-class] 162 163 uint16_t fStack[kStackSize]; // State stack, holds state pushes 164 int32_t fStackPtr; // and pops as specified in the state 165 // transition rules. 166 167 // 168 // Data associated with the generation of the pcode for the match engine 169 // 170 int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 171 // Always has high bit (31) set so that flag values 172 // on the paren stack are distinguished from relocatable 173 // pcode addresses. 174 int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 175 // until last flag is scanned. 176 UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 177 178 UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here. 179 // Once completed, meaning that some non-literal pattern 180 // construct is encountered, the appropriate opcodes 181 // to match the literal will be generated, and this 182 // string will be cleared. 183 184 int64_t fPatternLength; // Length of the input pattern string. 185 186 UVector32 fParenStack; // parentheses stack. Each frame consists of 187 // the positions of compiled pattern operations 188 // needing fixup, followed by negative value. The 189 // first entry in each frame is the position of the 190 // spot reserved for use when a quantifier 191 // needs to add a SAVE at the start of a (block) 192 // The negative value (-1, -2,...) indicates 193 // the kind of paren that opened the frame. Some 194 // need special handling on close. 195 196 197 int32_t fMatchOpenParen; // The position in the compiled pattern 198 // of the slot reserved for a state save 199 // at the start of the most recently processed 200 // parenthesized block. Updated when processing 201 // a close to the location for the corresponding open. 202 203 int32_t fMatchCloseParen; // The position in the pattern of the first 204 // location after the most recently processed 205 // parenthesized block. 206 207 int32_t fIntervalLow; // {lower, upper} interval quantifier values. 208 int32_t fIntervalUpper; // Placed here temporarily, when pattern is 209 // initially scanned. Each new interval 210 // encountered overwrites these values. 211 // -1 for the upper interval value means none 212 // was specified (unlimited occurences.) 213 214 int64_t fNameStartPos; // Starting position of a \N{NAME} name in a 215 // pattern, valid while remainder of name is 216 // scanned. 217 218 UStack fSetStack; // Stack of UnicodeSets, used while evaluating 219 // (at compile time) set expressions within 220 // the pattern. 221 UStack fSetOpStack; // Stack of pending set operators (&&, --, union) 222 223 UChar32 fLastSetLiteral; // The last single code point added to a set. 224 // needed when "-y" is scanned, and we need 225 // to turn "x-y" into a range. 226 227 UnicodeString *fCaptureName; // Named Capture, the group name is built up 228 // in this string while being scanned. 229 }; 230 231 // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] 232 // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. 233 234 enum SetOperations { 235 setStart = 0 << 16 | 1, 236 setEnd = 1 << 16 | 2, 237 setNegation = 2 << 16 | 3, 238 setCaseClose = 2 << 16 | 9, 239 setDifference2 = 3 << 16 | 4, // '--' set difference operator 240 setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator 241 setUnion = 4 << 16 | 6, // implicit union of adjacent items 242 setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. 243 setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 244 }; 245 246 U_NAMESPACE_END 247 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 248 #endif // RBBISCAN_H 249