1 // 2 // regexcmp.h 3 // 4 // Copyright (C) 2002-2007, International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains declarations for the class RegexCompile 8 // 9 // This class is internal to the regular expression implementation. 10 // For the public Regular Expression API, see the file "unicode/regex.h" 11 // 12 13 14 #ifndef RBBISCAN_H 15 #define RBBISCAN_H 16 17 #include "unicode/utypes.h" 18 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 19 20 #include "unicode/uobject.h" 21 #include "unicode/uniset.h" 22 #include "unicode/parseerr.h" 23 #include "uhash.h" 24 #include "uvector.h" 25 26 27 28 U_NAMESPACE_BEGIN 29 30 31 //-------------------------------------------------------------------------------- 32 // 33 // class RegexCompile Contains the regular expression compiler. 34 // 35 //-------------------------------------------------------------------------------- 36 static const int kStackSize = 100; // The size of the state stack for 37 // pattern parsing. Corresponds roughly 38 // to the depth of parentheses nesting 39 // that is allowed in the rules. 40 41 struct RegexTableEl; 42 class RegexPattern; 43 44 45 class RegexCompile : public UMemory { 46 public: 47 48 struct RegexPatternChar { 49 UChar32 fChar; 50 UBool fQuoted; 51 }; 52 53 RegexCompile(RegexPattern *rp, UErrorCode &e); 54 55 void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 56 57 58 virtual ~RegexCompile(); 59 60 void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 61 62 static void cleanup(); // Memory cleanup 63 64 65 66 // Categories of parentheses in pattern. 67 // The category is saved in the compile-time parentheses stack frame, and 68 // determines the code to be generated when the matching close ) is encountered. 69 enum EParenClass { 70 plain = -1, // No special handling 71 capturing = -2, 72 atomic = -3, 73 lookAhead = -4, 74 negLookAhead = -5, 75 flags = -6, 76 lookBehind = -7, 77 lookBehindN = -8 78 }; 79 80 private: 81 82 83 UBool doParseActions(int32_t a); 84 void error(UErrorCode e); // error reporting convenience function. 85 86 UChar32 nextCharLL(); 87 UChar32 peekCharLL(); 88 UnicodeSet *scanProp(); 89 UnicodeSet *scanPosixProp(); 90 void handleCloseParen(); 91 int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 92 // at the top of the just completed block 93 // or operation, and optionally ensure that 94 // there is space to add an opcode there. 95 void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 96 // a reference to a UnicodeSet. 97 void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 98 int32_t LoopOp); 99 UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 100 void literalChar(UChar32 c); // Compile a literal char 101 void fixLiterals(UBool split=FALSE); // Fix literal strings. 102 void insertOp(int32_t where); // Open up a slot for a new op in the 103 // generated code at the specified location. 104 void emitONE_CHAR(UChar32 c); // EMit a ONE_CHAR op into the compiled code, 105 // taking case mode into account. 106 int32_t minMatchLength(int32_t start, 107 int32_t end); 108 int32_t maxMatchLength(int32_t start, 109 int32_t end); 110 void matchStartType(); 111 void stripNOPs(); 112 113 void setEval(int32_t op); 114 void setPushOp(int32_t op); 115 UChar32 scanNamedChar(); 116 UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); 117 118 119 UErrorCode *fStatus; 120 RegexPattern *fRXPat; 121 UParseError *fParseErr; 122 123 // 124 // Data associated with low level character scanning 125 // 126 int32_t fScanIndex; // Index of current character being processed 127 // in the rule input string. 128 int32_t fNextIndex; // Index of the next character, which 129 // is the first character not yet scanned. 130 UBool fQuoteMode; // Scan is in a \Q...\E quoted region 131 UBool fInBackslashQuote; // Scan is between a '\' and the following char. 132 UBool fEOLComments; // When scan is just after '(?', inhibit #... to 133 // end of line comments, in favor of (?#...) comments. 134 int32_t fLineNum; // Line number in input file. 135 int32_t fCharNum; // Char position within the line. 136 UChar32 fLastChar; // Previous char, needed to count CR-LF 137 // as a single line, not two. 138 UChar32 fPeekChar; // Saved char, if we've scanned ahead. 139 140 141 RegexPatternChar fC; // Current char for parse state machine 142 // processing. 143 144 // 145 // Data for the state machine that parses the regular expression. 146 // 147 RegexTableEl **fStateTable; // State Transition Table for regex Rule 148 // parsing. index by p[state][char-class] 149 150 uint16_t fStack[kStackSize]; // State stack, holds state pushes 151 int32_t fStackPtr; // and pops as specified in the state 152 // transition rules. 153 154 // 155 // Data associated with the generation of the pcode for the match engine 156 // 157 int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 158 // Always has high bit (31) set so that flag values 159 // on the paren stack are distinguished from relocatable 160 // pcode addresses. 161 int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 162 // until last flag is scanned. 163 UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 164 165 166 int32_t fStringOpStart; // While a literal string is being scanned 167 // holds the start index within RegexPattern. 168 // fLiteralText where the string is being stored. 169 170 int32_t fPatternLength; // Length of the input pattern string. 171 172 UVector32 fParenStack; // parentheses stack. Each frame consists of 173 // the positions of compiled pattern operations 174 // needing fixup, followed by negative value. The 175 // first entry in each frame is the position of the 176 // spot reserved for use when a quantifier 177 // needs to add a SAVE at the start of a (block) 178 // The negative value (-1, -2,...) indicates 179 // the kind of paren that opened the frame. Some 180 // need special handling on close. 181 182 183 int32_t fMatchOpenParen; // The position in the compiled pattern 184 // of the slot reserved for a state save 185 // at the start of the most recently processed 186 // parenthesized block. 187 int32_t fMatchCloseParen; // The position in the pattern of the first 188 // location after the most recently processed 189 // parenthesized block. 190 191 int32_t fIntervalLow; // {lower, upper} interval quantifier values. 192 int32_t fIntervalUpper; // Placed here temporarily, when pattern is 193 // initially scanned. Each new interval 194 // encountered overwrites these values. 195 // -1 for the upper interval value means none 196 // was specified (unlimited occurences.) 197 198 int32_t fNameStartPos; // Starting position of a \N{NAME} name in a 199 // pattern, valid while remainder of name is 200 // scanned. 201 202 UStack fSetStack; // Stack of UnicodeSets, used while evaluating 203 // (at compile time) set expressions within 204 // the pattern. 205 UStack fSetOpStack; // Stack of pending set operators (&&, --, union) 206 207 UChar32 fLastSetLiteral; // The last single code point added to a set. 208 // needed when "-y" is scanned, and we need 209 // to turn "x-y" into a range. 210 211 }; 212 213 // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] 214 // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. 215 216 enum SetOperations { 217 setStart = 0 << 16 | 1, 218 setEnd = 1 << 16 | 2, 219 setNegation = 2 << 16 | 3, 220 setCaseClose = 2 << 16 | 9, 221 setDifference2 = 3 << 16 | 4, // '--' set difference operator 222 setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator 223 setUnion = 4 << 16 | 6, // implicit union of adjacent items 224 setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. 225 setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. 226 }; 227 228 U_NAMESPACE_END 229 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 230 #endif // RBBISCAN_H 231