• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #ifndef RegexInterpreter_h
27 #define RegexInterpreter_h
28 
29 #include <wtf/Platform.h>
30 
31 #if ENABLE(YARR)
32 
33 #include <wtf/unicode/Unicode.h>
34 #include "RegexParser.h"
35 #include "RegexPattern.h"
36 
37 namespace JSC { namespace Yarr {
38 
39 class ByteDisjunction;
40 
41 struct ByteTerm {
42     enum Type {
43         TypeBodyAlternativeBegin,
44         TypeBodyAlternativeDisjunction,
45         TypeBodyAlternativeEnd,
46         TypeAlternativeBegin,
47         TypeAlternativeDisjunction,
48         TypeAlternativeEnd,
49         TypeSubpatternBegin,
50         TypeSubpatternEnd,
51         TypeAssertionBOL,
52         TypeAssertionEOL,
53         TypeAssertionWordBoundary,
54         TypePatternCharacterOnce,
55         TypePatternCharacterFixed,
56         TypePatternCharacterGreedy,
57         TypePatternCharacterNonGreedy,
58         TypePatternCasedCharacterOnce,
59         TypePatternCasedCharacterFixed,
60         TypePatternCasedCharacterGreedy,
61         TypePatternCasedCharacterNonGreedy,
62         TypeCharacterClass,
63         TypeBackReference,
64         TypeParenthesesSubpattern,
65         TypeParenthesesSubpatternOnceBegin,
66         TypeParenthesesSubpatternOnceEnd,
67         TypeParentheticalAssertionBegin,
68         TypeParentheticalAssertionEnd,
69         TypeCheckInput,
70     } type;
71     bool invertOrCapture;
72     union {
73         struct {
74             union {
75                 UChar patternCharacter;
76                 struct {
77                     UChar lo;
78                     UChar hi;
79                 } casedCharacter;
80                 CharacterClass* characterClass;
81                 unsigned subpatternId;
82             };
83             union {
84                 ByteDisjunction* parenthesesDisjunction;
85                 unsigned parenthesesWidth;
86             };
87             QuantifierType quantityType;
88             unsigned quantityCount;
89         } atom;
90         struct {
91             int next;
92             int end;
93         } alternative;
94         unsigned checkInputCount;
95     };
96     unsigned frameLocation;
97     int inputPosition;
98 
ByteTermByteTerm99     ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
100         : frameLocation(frameLocation)
101     {
102         switch (quantityType) {
103         case QuantifierFixedCount:
104             type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
105             break;
106         case QuantifierGreedy:
107             type = ByteTerm::TypePatternCharacterGreedy;
108             break;
109         case QuantifierNonGreedy:
110             type = ByteTerm::TypePatternCharacterNonGreedy;
111             break;
112         }
113 
114         atom.patternCharacter = ch;
115         atom.quantityType = quantityType;
116         atom.quantityCount = quantityCount;
117         inputPosition = inputPos;
118     }
119 
ByteTermByteTerm120     ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
121         : frameLocation(frameLocation)
122     {
123         switch (quantityType) {
124         case QuantifierFixedCount:
125             type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
126             break;
127         case QuantifierGreedy:
128             type = ByteTerm::TypePatternCasedCharacterGreedy;
129             break;
130         case QuantifierNonGreedy:
131             type = ByteTerm::TypePatternCasedCharacterNonGreedy;
132             break;
133         }
134 
135         atom.casedCharacter.lo = lo;
136         atom.casedCharacter.hi = hi;
137         atom.quantityType = quantityType;
138         atom.quantityCount = quantityCount;
139         inputPosition = inputPos;
140     }
141 
ByteTermByteTerm142     ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
143         : type(ByteTerm::TypeCharacterClass)
144         , invertOrCapture(invert)
145     {
146         atom.characterClass = characterClass;
147         atom.quantityType = QuantifierFixedCount;
148         atom.quantityCount = 1;
149         inputPosition = inputPos;
150     }
151 
ByteTermByteTerm152     ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos)
153         : type(type)
154         , invertOrCapture(invertOrCapture)
155     {
156         atom.subpatternId = subpatternId;
157         atom.parenthesesDisjunction = parenthesesInfo;
158         atom.quantityType = QuantifierFixedCount;
159         atom.quantityCount = 1;
160         inputPosition = inputPos;
161     }
162 
163     ByteTerm(Type type, bool invert = false)
typeByteTerm164         : type(type)
165         , invertOrCapture(invert)
166     {
167         atom.quantityType = QuantifierFixedCount;
168         atom.quantityCount = 1;
169     }
170 
ByteTermByteTerm171     ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos)
172         : type(type)
173         , invertOrCapture(invertOrCapture)
174     {
175         atom.subpatternId = subpatternId;
176         atom.quantityType = QuantifierFixedCount;
177         atom.quantityCount = 1;
178         inputPosition = inputPos;
179     }
180 
BOLByteTerm181     static ByteTerm BOL(int inputPos)
182     {
183         ByteTerm term(TypeAssertionBOL);
184         term.inputPosition = inputPos;
185         return term;
186     }
187 
CheckInputByteTerm188     static ByteTerm CheckInput(unsigned count)
189     {
190         ByteTerm term(TypeCheckInput);
191         term.checkInputCount = count;
192         return term;
193     }
194 
EOLByteTerm195     static ByteTerm EOL(int inputPos)
196     {
197         ByteTerm term(TypeAssertionEOL);
198         term.inputPosition = inputPos;
199         return term;
200     }
201 
WordBoundaryByteTerm202     static ByteTerm WordBoundary(bool invert, int inputPos)
203     {
204         ByteTerm term(TypeAssertionWordBoundary, invert);
205         term.inputPosition = inputPos;
206         return term;
207     }
208 
BackReferenceByteTerm209     static ByteTerm BackReference(unsigned subpatternId, int inputPos)
210     {
211         return ByteTerm(TypeBackReference, subpatternId, false, inputPos);
212     }
213 
BodyAlternativeBeginByteTerm214     static ByteTerm BodyAlternativeBegin()
215     {
216         ByteTerm term(TypeBodyAlternativeBegin);
217         term.alternative.next = 0;
218         term.alternative.end = 0;
219         return term;
220     }
221 
BodyAlternativeDisjunctionByteTerm222     static ByteTerm BodyAlternativeDisjunction()
223     {
224         ByteTerm term(TypeBodyAlternativeDisjunction);
225         term.alternative.next = 0;
226         term.alternative.end = 0;
227         return term;
228     }
229 
BodyAlternativeEndByteTerm230     static ByteTerm BodyAlternativeEnd()
231     {
232         ByteTerm term(TypeBodyAlternativeEnd);
233         term.alternative.next = 0;
234         term.alternative.end = 0;
235         return term;
236     }
237 
AlternativeBeginByteTerm238     static ByteTerm AlternativeBegin()
239     {
240         ByteTerm term(TypeAlternativeBegin);
241         term.alternative.next = 0;
242         term.alternative.end = 0;
243         return term;
244     }
245 
AlternativeDisjunctionByteTerm246     static ByteTerm AlternativeDisjunction()
247     {
248         ByteTerm term(TypeAlternativeDisjunction);
249         term.alternative.next = 0;
250         term.alternative.end = 0;
251         return term;
252     }
253 
AlternativeEndByteTerm254     static ByteTerm AlternativeEnd()
255     {
256         ByteTerm term(TypeAlternativeEnd);
257         term.alternative.next = 0;
258         term.alternative.end = 0;
259         return term;
260     }
261 
SubpatternBeginByteTerm262     static ByteTerm SubpatternBegin()
263     {
264         return ByteTerm(TypeSubpatternBegin);
265     }
266 
SubpatternEndByteTerm267     static ByteTerm SubpatternEnd()
268     {
269         return ByteTerm(TypeSubpatternEnd);
270     }
271 
invertByteTerm272     bool invert()
273     {
274         return invertOrCapture;
275     }
276 
captureByteTerm277     bool capture()
278     {
279         return invertOrCapture;
280     }
281 };
282 
283 class ByteDisjunction {
284 public:
ByteDisjunction(unsigned numSubpatterns,unsigned frameSize)285     ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
286         : m_numSubpatterns(numSubpatterns)
287         , m_frameSize(frameSize)
288     {
289     }
290 
291     Vector<ByteTerm> terms;
292     unsigned m_numSubpatterns;
293     unsigned m_frameSize;
294 };
295 
296 struct BytecodePattern {
BytecodePatternBytecodePattern297     BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern)
298         : m_body(body)
299         , m_ignoreCase(pattern.m_ignoreCase)
300         , m_multiline(pattern.m_multiline)
301     {
302         newlineCharacterClass = pattern.newlineCharacterClass();
303         wordcharCharacterClass = pattern.wordcharCharacterClass();
304 
305         m_allParenthesesInfo.append(allParenthesesInfo);
306         m_userCharacterClasses.append(pattern.m_userCharacterClasses);
307         // 'Steal' the RegexPattern's CharacterClasses!  We clear its
308         // array, so that it won't delete them on destruction.  We'll
309         // take responsibility for that.
310         pattern.m_userCharacterClasses.clear();
311     }
312 
~BytecodePatternBytecodePattern313     ~BytecodePattern()
314     {
315         deleteAllValues(m_allParenthesesInfo);
316         deleteAllValues(m_userCharacterClasses);
317     }
318 
319     OwnPtr<ByteDisjunction> m_body;
320     bool m_ignoreCase;
321     bool m_multiline;
322 
323     CharacterClass* newlineCharacterClass;
324     CharacterClass* wordcharCharacterClass;
325 private:
326     Vector<ByteDisjunction*> m_allParenthesesInfo;
327     Vector<CharacterClass*> m_userCharacterClasses;
328 };
329 
330 BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false);
331 int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output);
332 
333 } } // namespace JSC::Yarr
334 
335 #endif
336 
337 #endif // RegexInterpreter_h
338