• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #ifndef YarrParser_h
27 #define YarrParser_h
28 
29 #include <runtime/UString.h>
30 #include "Yarr.h"
31 #include <wtf/ASCIICType.h>
32 #include <wtf/unicode/Unicode.h>
33 
34 namespace JSC { namespace Yarr {
35 
36 #define REGEXP_ERROR_PREFIX "Invalid regular expression: "
37 
38 enum BuiltInCharacterClassID {
39     DigitClassID,
40     SpaceClassID,
41     WordClassID,
42     NewlineClassID,
43 };
44 
45 // The Parser class should not be used directly - only via the Yarr::parse() method.
46 template<class Delegate>
47 class Parser {
48 private:
49     template<class FriendDelegate>
50     friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
51 
52     enum ErrorCode {
53         NoError,
54         PatternTooLarge,
55         QuantifierOutOfOrder,
56         QuantifierWithoutAtom,
57         MissingParentheses,
58         ParenthesesUnmatched,
59         ParenthesesTypeInvalid,
60         CharacterClassUnmatched,
61         CharacterClassOutOfOrder,
62         EscapeUnterminated,
63         NumberOfErrorCodes
64     };
65 
66     /*
67      * CharacterClassParserDelegate:
68      *
69      * The class CharacterClassParserDelegate is used in the parsing of character
70      * classes.  This class handles detection of character ranges.  This class
71      * implements enough of the delegate interface such that it can be passed to
72      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
73      * to perform the parsing of escape characters in character sets.
74      */
75     class CharacterClassParserDelegate {
76     public:
CharacterClassParserDelegate(Delegate & delegate,ErrorCode & err)77         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
78             : m_delegate(delegate)
79             , m_err(err)
80             , m_state(Empty)
81             , m_character(0)
82         {
83         }
84 
85         /*
86          * begin():
87          *
88          * Called at beginning of construction.
89          */
begin(bool invert)90         void begin(bool invert)
91         {
92             m_delegate.atomCharacterClassBegin(invert);
93         }
94 
95         /*
96          * atomPatternCharacter():
97          *
98          * This method is called either from parseCharacterClass() (for an unescaped
99          * character in a character class), or from parseEscape(). In the former case
100          * the value true will be passed for the argument 'hyphenIsRange', and in this
101          * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
102          * is different to /[a\-z]/).
103          */
104         void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
105         {
106             switch (m_state) {
107             case AfterCharacterClass:
108                 // Following a builtin character class we need look out for a hyphen.
109                 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
110                 // If we see a hyphen following a charater class then unlike usual
111                 // we'll report it to the delegate immediately, and put ourself into
112                 // a poisoned state. Any following calls to add another character or
113                 // character class will result in an error. (A hypen following a
114                 // character-class is itself valid, but only  at the end of a regex).
115                 if (hyphenIsRange && ch == '-') {
116                     m_delegate.atomCharacterClassAtom('-');
117                     m_state = AfterCharacterClassHyphen;
118                     return;
119                 }
120                 // Otherwise just fall through - cached character so treat this as Empty.
121 
122             case Empty:
123                 m_character = ch;
124                 m_state = CachedCharacter;
125                 return;
126 
127             case CachedCharacter:
128                 if (hyphenIsRange && ch == '-')
129                     m_state = CachedCharacterHyphen;
130                 else {
131                     m_delegate.atomCharacterClassAtom(m_character);
132                     m_character = ch;
133                 }
134                 return;
135 
136             case CachedCharacterHyphen:
137                 if (ch < m_character) {
138                     m_err = CharacterClassOutOfOrder;
139                     return;
140                 }
141                 m_delegate.atomCharacterClassRange(m_character, ch);
142                 m_state = Empty;
143                 return;
144 
145                 // See coment in atomBuiltInCharacterClass below.
146                 // This too is technically an error, per ECMA-262, and again we
147                 // we chose to allow this.  Note a subtlely here that while we
148                 // diverge from the spec's definition of CharacterRange we do
149                 // remain in compliance with the grammar.  For example, consider
150                 // the expression /[\d-a-z]/.  We comply with the grammar in
151                 // this case by not allowing a-z to be matched as a range.
152             case AfterCharacterClassHyphen:
153                 m_delegate.atomCharacterClassAtom(ch);
154                 m_state = Empty;
155                 return;
156             }
157         }
158 
159         /*
160          * atomBuiltInCharacterClass():
161          *
162          * Adds a built-in character class, called by parseEscape().
163          */
atomBuiltInCharacterClass(BuiltInCharacterClassID classID,bool invert)164         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
165         {
166             switch (m_state) {
167             case CachedCharacter:
168                 // Flush the currently cached character, then fall through.
169                 m_delegate.atomCharacterClassAtom(m_character);
170 
171             case Empty:
172             case AfterCharacterClass:
173                 m_state = AfterCharacterClass;
174                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
175                 return;
176 
177                 // If we hit either of these cases, we have an invalid range that
178                 // looks something like /[x-\d]/ or /[\d-\d]/.
179                 // According to ECMA-262 this should be a syntax error, but
180                 // empirical testing shows this to break teh webz.  Instead we
181                 // comply with to the ECMA-262 grammar, and assume the grammar to
182                 // have matched the range correctly, but tweak our interpretation
183                 // of CharacterRange.  Effectively we implicitly handle the hyphen
184                 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
185             case CachedCharacterHyphen:
186                 m_delegate.atomCharacterClassAtom(m_character);
187                 m_delegate.atomCharacterClassAtom('-');
188                 // fall through
189             case AfterCharacterClassHyphen:
190                 m_delegate.atomCharacterClassBuiltIn(classID, invert);
191                 m_state = Empty;
192                 return;
193             }
194         }
195 
196         /*
197          * end():
198          *
199          * Called at end of construction.
200          */
end()201         void end()
202         {
203             if (m_state == CachedCharacter)
204                 m_delegate.atomCharacterClassAtom(m_character);
205             else if (m_state == CachedCharacterHyphen) {
206                 m_delegate.atomCharacterClassAtom(m_character);
207                 m_delegate.atomCharacterClassAtom('-');
208             }
209             m_delegate.atomCharacterClassEnd();
210         }
211 
212         // parseEscape() should never call these delegate methods when
213         // invoked with inCharacterClass set.
assertionWordBoundary(bool)214         void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
atomBackReference(unsigned)215         void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
216 
217     private:
218         Delegate& m_delegate;
219         ErrorCode& m_err;
220         enum CharacterClassConstructionState {
221             Empty,
222             CachedCharacter,
223             CachedCharacterHyphen,
224             AfterCharacterClass,
225             AfterCharacterClassHyphen,
226         } m_state;
227         UChar m_character;
228     };
229 
Parser(Delegate & delegate,const UString & pattern,unsigned backReferenceLimit)230     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
231         : m_delegate(delegate)
232         , m_backReferenceLimit(backReferenceLimit)
233         , m_err(NoError)
234         , m_data(pattern.characters())
235         , m_size(pattern.length())
236         , m_index(0)
237         , m_parenthesesNestingDepth(0)
238     {
239     }
240 
241     /*
242      * parseEscape():
243      *
244      * Helper for parseTokens() AND parseCharacterClass().
245      * Unlike the other parser methods, this function does not report tokens
246      * directly to the member delegate (m_delegate), instead tokens are
247      * emitted to the delegate provided as an argument.  In the case of atom
248      * escapes, parseTokens() will call parseEscape() passing m_delegate as
249      * an argument, and as such the escape will be reported to the delegate.
250      *
251      * However this method may also be used by parseCharacterClass(), in which
252      * case a CharacterClassParserDelegate will be passed as the delegate that
253      * tokens should be added to.  A boolean flag is also provided to indicate
254      * whether that an escape in a CharacterClass is being parsed (some parsing
255      * rules change in this context).
256      *
257      * The boolean value returned by this method indicates whether the token
258      * parsed was an atom (outside of a characted class \b and \B will be
259      * interpreted as assertions).
260      */
261     template<bool inCharacterClass, class EscapeDelegate>
parseEscape(EscapeDelegate & delegate)262     bool parseEscape(EscapeDelegate& delegate)
263     {
264         ASSERT(!m_err);
265         ASSERT(peek() == '\\');
266         consume();
267 
268         if (atEndOfPattern()) {
269             m_err = EscapeUnterminated;
270             return false;
271         }
272 
273         switch (peek()) {
274         // Assertions
275         case 'b':
276             consume();
277             if (inCharacterClass)
278                 delegate.atomPatternCharacter('\b');
279             else {
280                 delegate.assertionWordBoundary(false);
281                 return false;
282             }
283             break;
284         case 'B':
285             consume();
286             if (inCharacterClass)
287                 delegate.atomPatternCharacter('B');
288             else {
289                 delegate.assertionWordBoundary(true);
290                 return false;
291             }
292             break;
293 
294         // CharacterClassEscape
295         case 'd':
296             consume();
297             delegate.atomBuiltInCharacterClass(DigitClassID, false);
298             break;
299         case 's':
300             consume();
301             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
302             break;
303         case 'w':
304             consume();
305             delegate.atomBuiltInCharacterClass(WordClassID, false);
306             break;
307         case 'D':
308             consume();
309             delegate.atomBuiltInCharacterClass(DigitClassID, true);
310             break;
311         case 'S':
312             consume();
313             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
314             break;
315         case 'W':
316             consume();
317             delegate.atomBuiltInCharacterClass(WordClassID, true);
318             break;
319 
320         // DecimalEscape
321         case '1':
322         case '2':
323         case '3':
324         case '4':
325         case '5':
326         case '6':
327         case '7':
328         case '8':
329         case '9': {
330             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
331             // First, try to parse this as backreference.
332             if (!inCharacterClass) {
333                 ParseState state = saveState();
334 
335                 unsigned backReference = consumeNumber();
336                 if (backReference <= m_backReferenceLimit) {
337                     delegate.atomBackReference(backReference);
338                     break;
339                 }
340 
341                 restoreState(state);
342             }
343 
344             // Not a backreference, and not octal.
345             if (peek() >= '8') {
346                 delegate.atomPatternCharacter('\\');
347                 break;
348             }
349 
350             // Fall-through to handle this as an octal escape.
351         }
352 
353         // Octal escape
354         case '0':
355             delegate.atomPatternCharacter(consumeOctal());
356             break;
357 
358         // ControlEscape
359         case 'f':
360             consume();
361             delegate.atomPatternCharacter('\f');
362             break;
363         case 'n':
364             consume();
365             delegate.atomPatternCharacter('\n');
366             break;
367         case 'r':
368             consume();
369             delegate.atomPatternCharacter('\r');
370             break;
371         case 't':
372             consume();
373             delegate.atomPatternCharacter('\t');
374             break;
375         case 'v':
376             consume();
377             delegate.atomPatternCharacter('\v');
378             break;
379 
380         // ControlLetter
381         case 'c': {
382             ParseState state = saveState();
383             consume();
384             if (!atEndOfPattern()) {
385                 int control = consume();
386 
387                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
388                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
389                     delegate.atomPatternCharacter(control & 0x1f);
390                     break;
391                 }
392             }
393             restoreState(state);
394             delegate.atomPatternCharacter('\\');
395             break;
396         }
397 
398         // HexEscape
399         case 'x': {
400             consume();
401             int x = tryConsumeHex(2);
402             if (x == -1)
403                 delegate.atomPatternCharacter('x');
404             else
405                 delegate.atomPatternCharacter(x);
406             break;
407         }
408 
409         // UnicodeEscape
410         case 'u': {
411             consume();
412             int u = tryConsumeHex(4);
413             if (u == -1)
414                 delegate.atomPatternCharacter('u');
415             else
416                 delegate.atomPatternCharacter(u);
417             break;
418         }
419 
420         // IdentityEscape
421         default:
422             delegate.atomPatternCharacter(consume());
423         }
424 
425         return true;
426     }
427 
428     /*
429      * parseAtomEscape(), parseCharacterClassEscape():
430      *
431      * These methods alias to parseEscape().
432      */
parseAtomEscape()433     bool parseAtomEscape()
434     {
435         return parseEscape<false>(m_delegate);
436     }
parseCharacterClassEscape(CharacterClassParserDelegate & delegate)437     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
438     {
439         parseEscape<true>(delegate);
440     }
441 
442     /*
443      * parseCharacterClass():
444      *
445      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
446      * to an instance of CharacterClassParserDelegate, to describe the character class to the
447      * delegate.
448      */
parseCharacterClass()449     void parseCharacterClass()
450     {
451         ASSERT(!m_err);
452         ASSERT(peek() == '[');
453         consume();
454 
455         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
456 
457         characterClassConstructor.begin(tryConsume('^'));
458 
459         while (!atEndOfPattern()) {
460             switch (peek()) {
461             case ']':
462                 consume();
463                 characterClassConstructor.end();
464                 return;
465 
466             case '\\':
467                 parseCharacterClassEscape(characterClassConstructor);
468                 break;
469 
470             default:
471                 characterClassConstructor.atomPatternCharacter(consume(), true);
472             }
473 
474             if (m_err)
475                 return;
476         }
477 
478         m_err = CharacterClassUnmatched;
479     }
480 
481     /*
482      * parseParenthesesBegin():
483      *
484      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
485      */
parseParenthesesBegin()486     void parseParenthesesBegin()
487     {
488         ASSERT(!m_err);
489         ASSERT(peek() == '(');
490         consume();
491 
492         if (tryConsume('?')) {
493             if (atEndOfPattern()) {
494                 m_err = ParenthesesTypeInvalid;
495                 return;
496             }
497 
498             switch (consume()) {
499             case ':':
500                 m_delegate.atomParenthesesSubpatternBegin(false);
501                 break;
502 
503             case '=':
504                 m_delegate.atomParentheticalAssertionBegin();
505                 break;
506 
507             case '!':
508                 m_delegate.atomParentheticalAssertionBegin(true);
509                 break;
510 
511             default:
512                 m_err = ParenthesesTypeInvalid;
513             }
514         } else
515             m_delegate.atomParenthesesSubpatternBegin();
516 
517         ++m_parenthesesNestingDepth;
518     }
519 
520     /*
521      * parseParenthesesEnd():
522      *
523      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
524      */
parseParenthesesEnd()525     void parseParenthesesEnd()
526     {
527         ASSERT(!m_err);
528         ASSERT(peek() == ')');
529         consume();
530 
531         if (m_parenthesesNestingDepth > 0)
532             m_delegate.atomParenthesesEnd();
533         else
534             m_err = ParenthesesUnmatched;
535 
536         --m_parenthesesNestingDepth;
537     }
538 
539     /*
540      * parseQuantifier():
541      *
542      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
543      */
parseQuantifier(bool lastTokenWasAnAtom,unsigned min,unsigned max)544     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
545     {
546         ASSERT(!m_err);
547         ASSERT(min <= max);
548 
549         if (lastTokenWasAnAtom)
550             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
551         else
552             m_err = QuantifierWithoutAtom;
553     }
554 
555     /*
556      * parseTokens():
557      *
558      * This method loops over the input pattern reporting tokens to the delegate.
559      * The method returns when a parse error is detected, or the end of the pattern
560      * is reached.  One piece of state is tracked around the loop, which is whether
561      * the last token passed to the delegate was an atom (this is necessary to detect
562      * a parse error when a quantifier provided without an atom to quantify).
563      */
parseTokens()564     void parseTokens()
565     {
566         bool lastTokenWasAnAtom = false;
567 
568         while (!atEndOfPattern()) {
569             switch (peek()) {
570             case '|':
571                 consume();
572                 m_delegate.disjunction();
573                 lastTokenWasAnAtom = false;
574                 break;
575 
576             case '(':
577                 parseParenthesesBegin();
578                 lastTokenWasAnAtom = false;
579                 break;
580 
581             case ')':
582                 parseParenthesesEnd();
583                 lastTokenWasAnAtom = true;
584                 break;
585 
586             case '^':
587                 consume();
588                 m_delegate.assertionBOL();
589                 lastTokenWasAnAtom = false;
590                 break;
591 
592             case '$':
593                 consume();
594                 m_delegate.assertionEOL();
595                 lastTokenWasAnAtom = false;
596                 break;
597 
598             case '.':
599                 consume();
600                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
601                 lastTokenWasAnAtom = true;
602                 break;
603 
604             case '[':
605                 parseCharacterClass();
606                 lastTokenWasAnAtom = true;
607                 break;
608 
609             case '\\':
610                 lastTokenWasAnAtom = parseAtomEscape();
611                 break;
612 
613             case '*':
614                 consume();
615                 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
616                 lastTokenWasAnAtom = false;
617                 break;
618 
619             case '+':
620                 consume();
621                 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
622                 lastTokenWasAnAtom = false;
623                 break;
624 
625             case '?':
626                 consume();
627                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
628                 lastTokenWasAnAtom = false;
629                 break;
630 
631             case '{': {
632                 ParseState state = saveState();
633 
634                 consume();
635                 if (peekIsDigit()) {
636                     unsigned min = consumeNumber();
637                     unsigned max = min;
638 
639                     if (tryConsume(','))
640                         max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
641 
642                     if (tryConsume('}')) {
643                         if (min <= max)
644                             parseQuantifier(lastTokenWasAnAtom, min, max);
645                         else
646                             m_err = QuantifierOutOfOrder;
647                         lastTokenWasAnAtom = false;
648                         break;
649                     }
650                 }
651 
652                 restoreState(state);
653             } // if we did not find a complete quantifer, fall through to the default case.
654 
655             default:
656                 m_delegate.atomPatternCharacter(consume());
657                 lastTokenWasAnAtom = true;
658             }
659 
660             if (m_err)
661                 return;
662         }
663 
664         if (m_parenthesesNestingDepth > 0)
665             m_err = MissingParentheses;
666     }
667 
668     /*
669      * parse():
670      *
671      * This method calls parseTokens() to parse over the input and converts any
672      * error code to a const char* for a result.
673      */
parse()674     const char* parse()
675     {
676         if (m_size > MAX_PATTERN_SIZE)
677             m_err = PatternTooLarge;
678         else
679             parseTokens();
680         ASSERT(atEndOfPattern() || m_err);
681 
682         // The order of this array must match the ErrorCode enum.
683         static const char* errorMessages[NumberOfErrorCodes] = {
684             0, // NoError
685             REGEXP_ERROR_PREFIX "regular expression too large",
686             REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
687             REGEXP_ERROR_PREFIX "nothing to repeat",
688             REGEXP_ERROR_PREFIX "missing )",
689             REGEXP_ERROR_PREFIX "unmatched parentheses",
690             REGEXP_ERROR_PREFIX "unrecognized character after (?",
691             REGEXP_ERROR_PREFIX "missing terminating ] for character class",
692             REGEXP_ERROR_PREFIX "range out of order in character class",
693             REGEXP_ERROR_PREFIX "\\ at end of pattern"
694         };
695 
696         return errorMessages[m_err];
697     }
698 
699 
700     // Misc helper functions:
701 
702     typedef unsigned ParseState;
703 
saveState()704     ParseState saveState()
705     {
706         return m_index;
707     }
708 
restoreState(ParseState state)709     void restoreState(ParseState state)
710     {
711         m_index = state;
712     }
713 
atEndOfPattern()714     bool atEndOfPattern()
715     {
716         ASSERT(m_index <= m_size);
717         return m_index == m_size;
718     }
719 
peek()720     int peek()
721     {
722         ASSERT(m_index < m_size);
723         return m_data[m_index];
724     }
725 
peekIsDigit()726     bool peekIsDigit()
727     {
728         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
729     }
730 
peekDigit()731     unsigned peekDigit()
732     {
733         ASSERT(peekIsDigit());
734         return peek() - '0';
735     }
736 
consume()737     int consume()
738     {
739         ASSERT(m_index < m_size);
740         return m_data[m_index++];
741     }
742 
consumeDigit()743     unsigned consumeDigit()
744     {
745         ASSERT(peekIsDigit());
746         return consume() - '0';
747     }
748 
consumeNumber()749     unsigned consumeNumber()
750     {
751         unsigned n = consumeDigit();
752         // check for overflow.
753         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
754             n = newValue;
755             consume();
756         }
757         return n;
758     }
759 
consumeOctal()760     unsigned consumeOctal()
761     {
762         ASSERT(WTF::isASCIIOctalDigit(peek()));
763 
764         unsigned n = consumeDigit();
765         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
766             n = n * 8 + consumeDigit();
767         return n;
768     }
769 
tryConsume(UChar ch)770     bool tryConsume(UChar ch)
771     {
772         if (atEndOfPattern() || (m_data[m_index] != ch))
773             return false;
774         ++m_index;
775         return true;
776     }
777 
tryConsumeHex(int count)778     int tryConsumeHex(int count)
779     {
780         ParseState state = saveState();
781 
782         int n = 0;
783         while (count--) {
784             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
785                 restoreState(state);
786                 return -1;
787             }
788             n = (n << 4) | WTF::toASCIIHexValue(consume());
789         }
790         return n;
791     }
792 
793     Delegate& m_delegate;
794     unsigned m_backReferenceLimit;
795     ErrorCode m_err;
796     const UChar* m_data;
797     unsigned m_size;
798     unsigned m_index;
799     unsigned m_parenthesesNestingDepth;
800 
801     // Derived by empirical testing of compile time in PCRE and WREC.
802     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
803 };
804 
805 /*
806  * Yarr::parse():
807  *
808  * The parse method is passed a pattern to be parsed and a delegate upon which
809  * callbacks will be made to record the parsed tokens forming the regex.
810  * Yarr::parse() returns null on success, or a const C string providing an error
811  * message where a parse error occurs.
812  *
813  * The Delegate must implement the following interface:
814  *
815  *    void assertionBOL();
816  *    void assertionEOL();
817  *    void assertionWordBoundary(bool invert);
818  *
819  *    void atomPatternCharacter(UChar ch);
820  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
821  *    void atomCharacterClassBegin(bool invert)
822  *    void atomCharacterClassAtom(UChar ch)
823  *    void atomCharacterClassRange(UChar begin, UChar end)
824  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
825  *    void atomCharacterClassEnd()
826  *    void atomParenthesesSubpatternBegin(bool capture = true);
827  *    void atomParentheticalAssertionBegin(bool invert = false);
828  *    void atomParenthesesEnd();
829  *    void atomBackReference(unsigned subpatternId);
830  *
831  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
832  *
833  *    void disjunction();
834  *
835  * The regular expression is described by a sequence of assertion*() and atom*()
836  * callbacks to the delegate, describing the terms in the regular expression.
837  * Following an atom a quantifyAtom() call may occur to indicate that the previous
838  * atom should be quantified.  In the case of atoms described across multiple
839  * calls (parentheses and character classes) the call to quantifyAtom() will come
840  * after the call to the atom*End() method, never after atom*Begin().
841  *
842  * Character classes may either be described by a single call to
843  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
844  * In the latter case, ...Begin() will be called, followed by a sequence of
845  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
846  *
847  * Sequences of atoms and assertions are broken into alternatives via calls to
848  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
849  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
850  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
851  * capturing subpattern, this will be the subpatternId associated with these
852  * parentheses, and will also by definition be the lowest subpatternId of these
853  * parentheses and of any nested paretheses.  The atomParenthesesEnd() method
854  * is passed the subpatternId of the last capturing subexpression nested within
855  * these paretheses.  In the case of a capturing subpattern with no nested
856  * capturing subpatterns, the same subpatternId will be passed to the begin and
857  * end functions.  In the case of non-capturing subpatterns the subpatternId
858  * passed to the begin method is also the first possible subpatternId that might
859  * be nested within these paretheses.  If a set of non-capturing parentheses does
860  * not contain any capturing subpatterns, then the subpatternId passed to begin
861  * will be greater than the subpatternId passed to end.
862  */
863 
864 template<class Delegate>
865 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
866 {
867     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
868 }
869 
870 } } // namespace JSC::Yarr
871 
872 #endif // YarrParser_h
873