1 /* 2 * Copyright (C) 2009 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #ifndef RegexInterpreter_h 27 #define RegexInterpreter_h 28 29 #include <wtf/Platform.h> 30 31 #if ENABLE(YARR) 32 33 #include <wtf/unicode/Unicode.h> 34 #include "RegexParser.h" 35 #include "RegexPattern.h" 36 37 namespace JSC { namespace Yarr { 38 39 class ByteDisjunction; 40 41 struct ByteTerm { 42 enum Type { 43 TypeBodyAlternativeBegin, 44 TypeBodyAlternativeDisjunction, 45 TypeBodyAlternativeEnd, 46 TypeAlternativeBegin, 47 TypeAlternativeDisjunction, 48 TypeAlternativeEnd, 49 TypeSubpatternBegin, 50 TypeSubpatternEnd, 51 TypeAssertionBOL, 52 TypeAssertionEOL, 53 TypeAssertionWordBoundary, 54 TypePatternCharacterOnce, 55 TypePatternCharacterFixed, 56 TypePatternCharacterGreedy, 57 TypePatternCharacterNonGreedy, 58 TypePatternCasedCharacterOnce, 59 TypePatternCasedCharacterFixed, 60 TypePatternCasedCharacterGreedy, 61 TypePatternCasedCharacterNonGreedy, 62 TypeCharacterClass, 63 TypeBackReference, 64 TypeParenthesesSubpattern, 65 TypeParenthesesSubpatternOnceBegin, 66 TypeParenthesesSubpatternOnceEnd, 67 TypeParentheticalAssertionBegin, 68 TypeParentheticalAssertionEnd, 69 TypeCheckInput, 70 } type; 71 bool invertOrCapture; 72 union { 73 struct { 74 union { 75 UChar patternCharacter; 76 struct { 77 UChar lo; 78 UChar hi; 79 } casedCharacter; 80 CharacterClass* characterClass; 81 unsigned subpatternId; 82 }; 83 union { 84 ByteDisjunction* parenthesesDisjunction; 85 unsigned parenthesesWidth; 86 }; 87 QuantifierType quantityType; 88 unsigned quantityCount; 89 } atom; 90 struct { 91 int next; 92 int end; 93 } alternative; 94 unsigned checkInputCount; 95 }; 96 unsigned frameLocation; 97 int inputPosition; 98 ByteTermByteTerm99 ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 100 : frameLocation(frameLocation) 101 { 102 switch (quantityType) { 103 case QuantifierFixedCount: 104 type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; 105 break; 106 case QuantifierGreedy: 107 type = ByteTerm::TypePatternCharacterGreedy; 108 break; 109 case QuantifierNonGreedy: 110 type = ByteTerm::TypePatternCharacterNonGreedy; 111 break; 112 } 113 114 atom.patternCharacter = ch; 115 atom.quantityType = quantityType; 116 atom.quantityCount = quantityCount; 117 inputPosition = inputPos; 118 } 119 ByteTermByteTerm120 ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) 121 : frameLocation(frameLocation) 122 { 123 switch (quantityType) { 124 case QuantifierFixedCount: 125 type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; 126 break; 127 case QuantifierGreedy: 128 type = ByteTerm::TypePatternCasedCharacterGreedy; 129 break; 130 case QuantifierNonGreedy: 131 type = ByteTerm::TypePatternCasedCharacterNonGreedy; 132 break; 133 } 134 135 atom.casedCharacter.lo = lo; 136 atom.casedCharacter.hi = hi; 137 atom.quantityType = quantityType; 138 atom.quantityCount = quantityCount; 139 inputPosition = inputPos; 140 } 141 ByteTermByteTerm142 ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) 143 : type(ByteTerm::TypeCharacterClass) 144 , invertOrCapture(invert) 145 { 146 atom.characterClass = characterClass; 147 atom.quantityType = QuantifierFixedCount; 148 atom.quantityCount = 1; 149 inputPosition = inputPos; 150 } 151 ByteTermByteTerm152 ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos) 153 : type(type) 154 , invertOrCapture(invertOrCapture) 155 { 156 atom.subpatternId = subpatternId; 157 atom.parenthesesDisjunction = parenthesesInfo; 158 atom.quantityType = QuantifierFixedCount; 159 atom.quantityCount = 1; 160 inputPosition = inputPos; 161 } 162 163 ByteTerm(Type type, bool invert = false) typeByteTerm164 : type(type) 165 , invertOrCapture(invert) 166 { 167 atom.quantityType = QuantifierFixedCount; 168 atom.quantityCount = 1; 169 } 170 ByteTermByteTerm171 ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos) 172 : type(type) 173 , invertOrCapture(invertOrCapture) 174 { 175 atom.subpatternId = subpatternId; 176 atom.quantityType = QuantifierFixedCount; 177 atom.quantityCount = 1; 178 inputPosition = inputPos; 179 } 180 BOLByteTerm181 static ByteTerm BOL(int inputPos) 182 { 183 ByteTerm term(TypeAssertionBOL); 184 term.inputPosition = inputPos; 185 return term; 186 } 187 CheckInputByteTerm188 static ByteTerm CheckInput(unsigned count) 189 { 190 ByteTerm term(TypeCheckInput); 191 term.checkInputCount = count; 192 return term; 193 } 194 EOLByteTerm195 static ByteTerm EOL(int inputPos) 196 { 197 ByteTerm term(TypeAssertionEOL); 198 term.inputPosition = inputPos; 199 return term; 200 } 201 WordBoundaryByteTerm202 static ByteTerm WordBoundary(bool invert, int inputPos) 203 { 204 ByteTerm term(TypeAssertionWordBoundary, invert); 205 term.inputPosition = inputPos; 206 return term; 207 } 208 BackReferenceByteTerm209 static ByteTerm BackReference(unsigned subpatternId, int inputPos) 210 { 211 return ByteTerm(TypeBackReference, subpatternId, false, inputPos); 212 } 213 BodyAlternativeBeginByteTerm214 static ByteTerm BodyAlternativeBegin() 215 { 216 ByteTerm term(TypeBodyAlternativeBegin); 217 term.alternative.next = 0; 218 term.alternative.end = 0; 219 return term; 220 } 221 BodyAlternativeDisjunctionByteTerm222 static ByteTerm BodyAlternativeDisjunction() 223 { 224 ByteTerm term(TypeBodyAlternativeDisjunction); 225 term.alternative.next = 0; 226 term.alternative.end = 0; 227 return term; 228 } 229 BodyAlternativeEndByteTerm230 static ByteTerm BodyAlternativeEnd() 231 { 232 ByteTerm term(TypeBodyAlternativeEnd); 233 term.alternative.next = 0; 234 term.alternative.end = 0; 235 return term; 236 } 237 AlternativeBeginByteTerm238 static ByteTerm AlternativeBegin() 239 { 240 ByteTerm term(TypeAlternativeBegin); 241 term.alternative.next = 0; 242 term.alternative.end = 0; 243 return term; 244 } 245 AlternativeDisjunctionByteTerm246 static ByteTerm AlternativeDisjunction() 247 { 248 ByteTerm term(TypeAlternativeDisjunction); 249 term.alternative.next = 0; 250 term.alternative.end = 0; 251 return term; 252 } 253 AlternativeEndByteTerm254 static ByteTerm AlternativeEnd() 255 { 256 ByteTerm term(TypeAlternativeEnd); 257 term.alternative.next = 0; 258 term.alternative.end = 0; 259 return term; 260 } 261 SubpatternBeginByteTerm262 static ByteTerm SubpatternBegin() 263 { 264 return ByteTerm(TypeSubpatternBegin); 265 } 266 SubpatternEndByteTerm267 static ByteTerm SubpatternEnd() 268 { 269 return ByteTerm(TypeSubpatternEnd); 270 } 271 invertByteTerm272 bool invert() 273 { 274 return invertOrCapture; 275 } 276 captureByteTerm277 bool capture() 278 { 279 return invertOrCapture; 280 } 281 }; 282 283 class ByteDisjunction { 284 public: ByteDisjunction(unsigned numSubpatterns,unsigned frameSize)285 ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) 286 : m_numSubpatterns(numSubpatterns) 287 , m_frameSize(frameSize) 288 { 289 } 290 291 Vector<ByteTerm> terms; 292 unsigned m_numSubpatterns; 293 unsigned m_frameSize; 294 }; 295 296 struct BytecodePattern { BytecodePatternBytecodePattern297 BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern) 298 : m_body(body) 299 , m_ignoreCase(pattern.m_ignoreCase) 300 , m_multiline(pattern.m_multiline) 301 { 302 newlineCharacterClass = pattern.newlineCharacterClass(); 303 wordcharCharacterClass = pattern.wordcharCharacterClass(); 304 305 m_allParenthesesInfo.append(allParenthesesInfo); 306 m_userCharacterClasses.append(pattern.m_userCharacterClasses); 307 // 'Steal' the RegexPattern's CharacterClasses! We clear its 308 // array, so that it won't delete them on destruction. We'll 309 // take responsibility for that. 310 pattern.m_userCharacterClasses.clear(); 311 } 312 ~BytecodePatternBytecodePattern313 ~BytecodePattern() 314 { 315 deleteAllValues(m_allParenthesesInfo); 316 deleteAllValues(m_userCharacterClasses); 317 } 318 319 OwnPtr<ByteDisjunction> m_body; 320 bool m_ignoreCase; 321 bool m_multiline; 322 323 CharacterClass* newlineCharacterClass; 324 CharacterClass* wordcharCharacterClass; 325 private: 326 Vector<ByteDisjunction*> m_allParenthesesInfo; 327 Vector<CharacterClass*> m_userCharacterClasses; 328 }; 329 330 BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); 331 int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); 332 333 } } // namespace JSC::Yarr 334 335 #endif 336 337 #endif // RegexInterpreter_h 338