• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2008 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "config.h"
27 #include "WRECParser.h"
28 
29 #if ENABLE(WREC)
30 
31 #include "CharacterClassConstructor.h"
32 #include "WRECFunctors.h"
33 
34 using namespace WTF;
35 
36 namespace JSC { namespace WREC {
37 
38 // These error messages match the error messages used by PCRE.
39 const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier";
40 const char* Parser::QuantifierWithoutAtom = "nothing to repeat";
41 const char* Parser::ParenthesesUnmatched = "unmatched parentheses";
42 const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?";
43 const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet.
44 const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class";
45 const char* Parser::CharacterClassOutOfOrder = "range out of order in character class";
46 const char* Parser::EscapeUnterminated = "\\ at end of pattern";
47 
48 class PatternCharacterSequence {
49 typedef Generator::JumpList JumpList;
50 
51 public:
PatternCharacterSequence(Generator & generator,JumpList & failures)52     PatternCharacterSequence(Generator& generator, JumpList& failures)
53         : m_generator(generator)
54         , m_failures(failures)
55     {
56     }
57 
size()58     size_t size() { return m_sequence.size(); }
59 
append(int ch)60     void append(int ch)
61     {
62         m_sequence.append(ch);
63     }
64 
flush()65     void flush()
66     {
67         if (!m_sequence.size())
68             return;
69 
70         m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size());
71         m_sequence.clear();
72     }
73 
flush(const Quantifier & quantifier)74     void flush(const Quantifier& quantifier)
75     {
76         if (!m_sequence.size())
77             return;
78 
79         m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1);
80 
81         switch (quantifier.type) {
82         case Quantifier::None:
83         case Quantifier::Error:
84             ASSERT_NOT_REACHED();
85             break;
86 
87         case Quantifier::Greedy: {
88             GeneratePatternCharacterFunctor functor(m_sequence.last());
89             m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
90             break;
91         }
92 
93         case Quantifier::NonGreedy: {
94             GeneratePatternCharacterFunctor functor(m_sequence.last());
95             m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
96             break;
97         }
98         }
99 
100         m_sequence.clear();
101     }
102 
103 private:
104     Generator& m_generator;
105     JumpList& m_failures;
106     Vector<int, 8> m_sequence;
107 };
108 
consumeGreedyQuantifier()109 ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier()
110 {
111     switch (peek()) {
112         case '?':
113             consume();
114             return Quantifier(Quantifier::Greedy, 0, 1);
115 
116         case '*':
117             consume();
118             return Quantifier(Quantifier::Greedy, 0);
119 
120         case '+':
121             consume();
122             return Quantifier(Quantifier::Greedy, 1);
123 
124         case '{': {
125             SavedState state(*this);
126             consume();
127 
128             // Accept: {n}, {n,}, {n,m}.
129             // Reject: {n,m} where n > m.
130             // Ignore: Anything else, such as {n, m}.
131 
132             if (!peekIsDigit()) {
133                 state.restore();
134                 return Quantifier();
135             }
136 
137             unsigned min = consumeNumber();
138             unsigned max = min;
139 
140             if (peek() == ',') {
141                 consume();
142                 max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity;
143             }
144 
145             if (peek() != '}') {
146                 state.restore();
147                 return Quantifier();
148             }
149             consume();
150 
151             if (min > max) {
152                 setError(QuantifierOutOfOrder);
153                 return Quantifier(Quantifier::Error);
154             }
155 
156             return Quantifier(Quantifier::Greedy, min, max);
157          }
158 
159          default:
160             return Quantifier(); // No quantifier.
161     }
162 }
163 
consumeQuantifier()164 Quantifier Parser::consumeQuantifier()
165 {
166     Quantifier q = consumeGreedyQuantifier();
167 
168     if ((q.type == Quantifier::Greedy) && (peek() == '?')) {
169         consume();
170         q.type = Quantifier::NonGreedy;
171     }
172 
173     return q;
174 }
175 
parseCharacterClassQuantifier(JumpList & failures,const CharacterClass & charClass,bool invert)176 bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert)
177 {
178     Quantifier q = consumeQuantifier();
179 
180     switch (q.type) {
181     case Quantifier::None: {
182         m_generator.generateCharacterClass(failures, charClass, invert);
183         break;
184     }
185 
186     case Quantifier::Greedy: {
187         GenerateCharacterClassFunctor functor(&charClass, invert);
188         m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max);
189         break;
190     }
191 
192     case Quantifier::NonGreedy: {
193         GenerateCharacterClassFunctor functor(&charClass, invert);
194         m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max);
195         break;
196     }
197 
198     case Quantifier::Error:
199         return false;
200     }
201 
202     return true;
203 }
204 
parseBackreferenceQuantifier(JumpList & failures,unsigned subpatternId)205 bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId)
206 {
207     Quantifier q = consumeQuantifier();
208 
209     switch (q.type) {
210     case Quantifier::None: {
211         m_generator.generateBackreference(failures, subpatternId);
212         break;
213     }
214 
215     case Quantifier::Greedy:
216     case Quantifier::NonGreedy:
217         m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max);
218         return true;
219 
220     case Quantifier::Error:
221         return false;
222     }
223 
224     return true;
225 }
226 
parseParentheses(JumpList & failures)227 bool Parser::parseParentheses(JumpList& failures)
228 {
229     ParenthesesType type = consumeParenthesesType();
230 
231     // FIXME: WREC originally failed to backtrack correctly in cases such as
232     // "c".match(/(.*)c/). Now, most parentheses handling is disabled. For
233     // unsupported parentheses, we fall back on PCRE.
234 
235     switch (type) {
236         case Generator::Assertion: {
237             m_generator.generateParenthesesAssertion(failures);
238 
239             if (consume() != ')') {
240                 setError(ParenthesesUnmatched);
241                 return false;
242             }
243 
244             Quantifier quantifier = consumeQuantifier();
245             if (quantifier.type != Quantifier::None && quantifier.min == 0) {
246                 setError(ParenthesesNotSupported);
247                 return false;
248             }
249 
250             return true;
251         }
252         case Generator::InvertedAssertion: {
253             m_generator.generateParenthesesInvertedAssertion(failures);
254 
255             if (consume() != ')') {
256                 setError(ParenthesesUnmatched);
257                 return false;
258             }
259 
260             Quantifier quantifier = consumeQuantifier();
261             if (quantifier.type != Quantifier::None && quantifier.min == 0) {
262                 setError(ParenthesesNotSupported);
263                 return false;
264             }
265 
266             return true;
267         }
268         default:
269             setError(ParenthesesNotSupported);
270             return false;
271     }
272 }
273 
parseCharacterClass(JumpList & failures)274 bool Parser::parseCharacterClass(JumpList& failures)
275 {
276     bool invert = false;
277     if (peek() == '^') {
278         consume();
279         invert = true;
280     }
281 
282     CharacterClassConstructor constructor(m_ignoreCase);
283 
284     int ch;
285     while ((ch = peek()) != ']') {
286         switch (ch) {
287         case EndOfPattern:
288             setError(CharacterClassUnmatched);
289             return false;
290 
291         case '\\': {
292             consume();
293             Escape escape = consumeEscape(true);
294 
295             switch (escape.type()) {
296                 case Escape::PatternCharacter: {
297                     int character = PatternCharacterEscape::cast(escape).character();
298                     if (character == '-')
299                         constructor.flushBeforeEscapedHyphen();
300                     constructor.put(character);
301                     break;
302                 }
303                 case Escape::CharacterClass: {
304                     const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape);
305                     ASSERT(!characterClassEscape.invert());
306                     constructor.append(characterClassEscape.characterClass());
307                     break;
308                 }
309                 case Escape::Error:
310                     return false;
311                 case Escape::Backreference:
312                 case Escape::WordBoundaryAssertion: {
313                     ASSERT_NOT_REACHED();
314                     break;
315                 }
316             }
317             break;
318         }
319 
320         default:
321             consume();
322             constructor.put(ch);
323         }
324     }
325     consume();
326 
327     // lazily catch reversed ranges ([z-a])in character classes
328     if (constructor.isUpsideDown()) {
329         setError(CharacterClassOutOfOrder);
330         return false;
331     }
332 
333     constructor.flush();
334     CharacterClass charClass = constructor.charClass();
335     return parseCharacterClassQuantifier(failures, charClass, invert);
336 }
337 
parseNonCharacterEscape(JumpList & failures,const Escape & escape)338 bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape)
339 {
340     switch (escape.type()) {
341         case Escape::PatternCharacter:
342             ASSERT_NOT_REACHED();
343             return false;
344 
345         case Escape::CharacterClass:
346             return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert());
347 
348         case Escape::Backreference:
349             return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId());
350 
351         case Escape::WordBoundaryAssertion:
352             m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert());
353             return true;
354 
355         case Escape::Error:
356             return false;
357     }
358 
359     ASSERT_NOT_REACHED();
360     return false;
361 }
362 
consumeEscape(bool inCharacterClass)363 Escape Parser::consumeEscape(bool inCharacterClass)
364 {
365     switch (peek()) {
366     case EndOfPattern:
367         setError(EscapeUnterminated);
368         return Escape(Escape::Error);
369 
370     // Assertions
371     case 'b':
372         consume();
373         if (inCharacterClass)
374             return PatternCharacterEscape('\b');
375         return WordBoundaryAssertionEscape(false); // do not invert
376     case 'B':
377         consume();
378         if (inCharacterClass)
379             return PatternCharacterEscape('B');
380         return WordBoundaryAssertionEscape(true); // invert
381 
382     // CharacterClassEscape
383     case 'd':
384         consume();
385         return CharacterClassEscape(CharacterClass::digits(), false);
386     case 's':
387         consume();
388         return CharacterClassEscape(CharacterClass::spaces(), false);
389     case 'w':
390         consume();
391         return CharacterClassEscape(CharacterClass::wordchar(), false);
392     case 'D':
393         consume();
394         return inCharacterClass
395             ? CharacterClassEscape(CharacterClass::nondigits(), false)
396             : CharacterClassEscape(CharacterClass::digits(), true);
397     case 'S':
398         consume();
399         return inCharacterClass
400             ? CharacterClassEscape(CharacterClass::nonspaces(), false)
401             : CharacterClassEscape(CharacterClass::spaces(), true);
402     case 'W':
403         consume();
404         return inCharacterClass
405             ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
406             : CharacterClassEscape(CharacterClass::wordchar(), true);
407 
408     // DecimalEscape
409     case '1':
410     case '2':
411     case '3':
412     case '4':
413     case '5':
414     case '6':
415     case '7':
416     case '8':
417     case '9': {
418         if (peekDigit() > m_numSubpatterns || inCharacterClass) {
419             // To match Firefox, we parse an invalid backreference in the range [1-7]
420             // as an octal escape.
421             return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
422         }
423 
424         int value = 0;
425         do {
426             unsigned newValue = value * 10 + peekDigit();
427             if (newValue > m_numSubpatterns)
428                 break;
429             value = newValue;
430             consume();
431         } while (peekIsDigit());
432 
433         return BackreferenceEscape(value);
434     }
435 
436     // Octal escape
437     case '0':
438         consume();
439         return PatternCharacterEscape(consumeOctal());
440 
441     // ControlEscape
442     case 'f':
443         consume();
444         return PatternCharacterEscape('\f');
445     case 'n':
446         consume();
447         return PatternCharacterEscape('\n');
448     case 'r':
449         consume();
450         return PatternCharacterEscape('\r');
451     case 't':
452         consume();
453         return PatternCharacterEscape('\t');
454     case 'v':
455         consume();
456         return PatternCharacterEscape('\v');
457 
458     // ControlLetter
459     case 'c': {
460         SavedState state(*this);
461         consume();
462 
463         int control = consume();
464         // To match Firefox, inside a character class, we also accept numbers
465         // and '_' as control characters.
466         if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) {
467             state.restore();
468             return PatternCharacterEscape('\\');
469         }
470         return PatternCharacterEscape(control & 31);
471     }
472 
473     // HexEscape
474     case 'x': {
475         consume();
476 
477         SavedState state(*this);
478         int x = consumeHex(2);
479         if (x == -1) {
480             state.restore();
481             return PatternCharacterEscape('x');
482         }
483         return PatternCharacterEscape(x);
484     }
485 
486     // UnicodeEscape
487     case 'u': {
488         consume();
489 
490         SavedState state(*this);
491         int x = consumeHex(4);
492         if (x == -1) {
493             state.restore();
494             return PatternCharacterEscape('u');
495         }
496         return PatternCharacterEscape(x);
497     }
498 
499     // IdentityEscape
500     default:
501         return PatternCharacterEscape(consume());
502     }
503 }
504 
parseAlternative(JumpList & failures)505 void Parser::parseAlternative(JumpList& failures)
506 {
507     PatternCharacterSequence sequence(m_generator, failures);
508 
509     while (1) {
510         switch (peek()) {
511         case EndOfPattern:
512         case '|':
513         case ')':
514             sequence.flush();
515             return;
516 
517         case '*':
518         case '+':
519         case '?':
520         case '{': {
521             Quantifier q = consumeQuantifier();
522 
523             if (q.type == Quantifier::None) {
524                 sequence.append(consume());
525                 continue;
526             }
527 
528             if (q.type == Quantifier::Error)
529                 return;
530 
531             if (!sequence.size()) {
532                 setError(QuantifierWithoutAtom);
533                 return;
534             }
535 
536             sequence.flush(q);
537             continue;
538         }
539 
540         case '^':
541             consume();
542 
543             sequence.flush();
544             m_generator.generateAssertionBOL(failures);
545             continue;
546 
547         case '$':
548             consume();
549 
550             sequence.flush();
551             m_generator.generateAssertionEOL(failures);
552             continue;
553 
554         case '.':
555             consume();
556 
557             sequence.flush();
558             if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true))
559                 return;
560             continue;
561 
562         case '[':
563             consume();
564 
565             sequence.flush();
566             if (!parseCharacterClass(failures))
567                 return;
568             continue;
569 
570         case '(':
571             consume();
572 
573             sequence.flush();
574             if (!parseParentheses(failures))
575                 return;
576             continue;
577 
578         case '\\': {
579             consume();
580 
581             Escape escape = consumeEscape(false);
582             if (escape.type() == Escape::PatternCharacter) {
583                 sequence.append(PatternCharacterEscape::cast(escape).character());
584                 continue;
585             }
586 
587             sequence.flush();
588             if (!parseNonCharacterEscape(failures, escape))
589                 return;
590             continue;
591         }
592 
593         default:
594             sequence.append(consume());
595             continue;
596         }
597     }
598 }
599 
600 /*
601   TOS holds index.
602 */
parseDisjunction(JumpList & failures)603 void Parser::parseDisjunction(JumpList& failures)
604 {
605     parseAlternative(failures);
606     if (peek() != '|')
607         return;
608 
609     JumpList successes;
610     do {
611         consume();
612         m_generator.terminateAlternative(successes, failures);
613         parseAlternative(failures);
614     } while (peek() == '|');
615 
616     m_generator.terminateDisjunction(successes);
617 }
618 
consumeParenthesesType()619 Generator::ParenthesesType Parser::consumeParenthesesType()
620 {
621     if (peek() != '?')
622         return Generator::Capturing;
623     consume();
624 
625     switch (consume()) {
626     case ':':
627         return Generator::NonCapturing;
628 
629     case '=':
630         return Generator::Assertion;
631 
632     case '!':
633         return Generator::InvertedAssertion;
634 
635     default:
636         setError(ParenthesesTypeInvalid);
637         return Generator::Error;
638     }
639 }
640 
641 } } // namespace JSC::WREC
642 
643 #endif // ENABLE(WREC)
644