• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #ifndef RegexParser_h
27 #define RegexParser_h
28 
29 #include <wtf/Platform.h>
30 
31 #if ENABLE(YARR)
32 
33 #include <UString.h>
34 #include <wtf/ASCIICType.h>
35 #include <wtf/unicode/Unicode.h>
36 #include <limits.h>
37 
38 namespace JSC { namespace Yarr {
39 
// Identifiers for the predefined character classes the parser reports to a
// delegate.  parseEscape() reports Digit/Space/Word for \d, \s and \w (and,
// with invert set, for \D, \S and \W); parseTokens() reports NewlineClassID
// with invert set for '.'.
enum BuiltInCharacterClassID {
    DigitClassID,
    SpaceClassID,
    WordClassID,
    NewlineClassID
};
46 
47 // The Parser class should not be used directly - only via the Yarr::parse() method.
48 template<class Delegate>
49 class Parser {
50 private:
51     template<class FriendDelegate>
52     friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
53 
// Reasons a parse can fail.  The values are used as indices into the
// errorMessages table in parse(), so the order here must stay in step with
// that table.  NoError must remain zero because the code tests m_err as a
// boolean.  NumberOfErrorCodes is a count, not an error.
enum ErrorCode {
    NoError = 0,
    PatternTooLarge,
    QuantifierOutOfOrder,
    QuantifierWithoutAtom,
    MissingParentheses,
    ParenthesesUnmatched,
    ParenthesesTypeInvalid,
    CharacterClassUnmatched,
    CharacterClassOutOfOrder,
    EscapeUnterminated,
    NumberOfErrorCodes
};
67 
68     /*
69      * CharacterClassParserDelegate:
70      *
71      * The class CharacterClassParserDelegate is used in the parsing of character
72      * classes.  This class handles detection of character ranges.  This class
73      * implements enough of the delegate interface such that it can be passed to
74      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
75      * to perform the parsing of escape characters in character sets.
76      */
77     class CharacterClassParserDelegate {
78     public:
CharacterClassParserDelegate(Delegate & delegate,ErrorCode & err)79         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
80             : m_delegate(delegate)
81             , m_err(err)
82             , m_state(empty)
83         {
84         }
85 
86         /*
87          * begin():
88          *
89          * Called at beginning of construction.
90          */
begin(bool invert)91         void begin(bool invert)
92         {
93             m_delegate.atomCharacterClassBegin(invert);
94         }
95 
96         /*
97          * atomPatternCharacterUnescaped():
98          *
99          * This method is called directly from parseCharacterClass(), to report a new
100          * pattern character token.  This method differs from atomPatternCharacter(),
101          * which will be called from parseEscape(), since a hypen provided via this
102          * method may be indicating a character range, but a hyphen parsed by
103          * parseEscape() cannot be interpreted as doing so.
104          */
atomPatternCharacterUnescaped(UChar ch)105         void atomPatternCharacterUnescaped(UChar ch)
106         {
107             switch (m_state) {
108             case empty:
109                 m_character = ch;
110                 m_state = cachedCharacter;
111                 break;
112 
113             case cachedCharacter:
114                 if (ch == '-')
115                     m_state = cachedCharacterHyphen;
116                 else {
117                     m_delegate.atomCharacterClassAtom(m_character);
118                     m_character = ch;
119                 }
120                 break;
121 
122             case cachedCharacterHyphen:
123                 if (ch >= m_character)
124                     m_delegate.atomCharacterClassRange(m_character, ch);
125                 else
126                     m_err = CharacterClassOutOfOrder;
127                 m_state = empty;
128             }
129         }
130 
131         /*
132          * atomPatternCharacter():
133          *
134          * Adds a pattern character, called by parseEscape(), as such will not
135          * interpret a hyphen as indicating a character range.
136          */
atomPatternCharacter(UChar ch)137         void atomPatternCharacter(UChar ch)
138         {
139             // Flush if a character is already pending to prevent the
140             // hyphen from begin interpreted as indicating a range.
141             if((ch == '-') && (m_state == cachedCharacter))
142                 flush();
143 
144             atomPatternCharacterUnescaped(ch);
145         }
146 
147         /*
148          * atomBuiltInCharacterClass():
149          *
150          * Adds a built-in character class, called by parseEscape().
151          */
atomBuiltInCharacterClass(BuiltInCharacterClassID classID,bool invert)152         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
153         {
154             flush();
155             m_delegate.atomCharacterClassBuiltIn(classID, invert);
156         }
157 
158         /*
159          * end():
160          *
161          * Called at end of construction.
162          */
end()163         void end()
164         {
165             flush();
166             m_delegate.atomCharacterClassEnd();
167         }
168 
169         // parseEscape() should never call these delegate methods when
170         // invoked with inCharacterClass set.
assertionWordBoundary(bool)171         void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
atomBackReference(unsigned)172         void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
173 
174     private:
flush()175         void flush()
176         {
177             if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
178                 m_delegate.atomCharacterClassAtom(m_character);
179             if (m_state == cachedCharacterHyphen)
180                 m_delegate.atomCharacterClassAtom('-');
181             m_state = empty;
182         }
183 
184         Delegate& m_delegate;
185         ErrorCode& m_err;
186         enum CharacterClassConstructionState {
187             empty,
188             cachedCharacter,
189             cachedCharacterHyphen,
190         } m_state;
191         UChar m_character;
192     };
193 
// Binds the parser to 'delegate' and to the characters of 'pattern'.
// Only pattern.data() and pattern.size() are captured.
// NOTE(review): presumably 'pattern' must outlive the parser - confirm
// against UString's ownership rules.
Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
    : m_delegate(delegate),
      m_backReferenceLimit(backReferenceLimit),
      m_err(NoError),
      m_data(pattern.data()),
      m_size(pattern.size()),
      m_index(0),
      m_parenthesesNestingDepth(0)
{
}
204 
205     /*
206      * parseEscape():
207      *
208      * Helper for parseTokens() AND parseCharacterClass().
209      * Unlike the other parser methods, this function does not report tokens
210      * directly to the member delegate (m_delegate), instead tokens are
211      * emitted to the delegate provided as an argument.  In the case of atom
212      * escapes, parseTokens() will call parseEscape() passing m_delegate as
213      * an argument, and as such the escape will be reported to the delegate.
214      *
215      * However this method may also be used by parseCharacterClass(), in which
216      * case a CharacterClassParserDelegate will be passed as the delegate that
217      * tokens should be added to.  A boolean flag is also provided to indicate
218      * whether that an escape in a CharacterClass is being parsed (some parsing
219      * rules change in this context).
220      *
221      * The boolean value returned by this method indicates whether the token
222      * parsed was an atom (outside of a characted class \b and \B will be
223      * interpreted as assertions).
224      */
225     template<bool inCharacterClass, class EscapeDelegate>
parseEscape(EscapeDelegate & delegate)226     bool parseEscape(EscapeDelegate& delegate)
227     {
228         ASSERT(!m_err);
229         ASSERT(peek() == '\\');
230         consume();
231 
232         if (atEndOfPattern()) {
233             m_err = EscapeUnterminated;
234             return false;
235         }
236 
237         switch (peek()) {
238         // Assertions
239         case 'b':
240             consume();
241             if (inCharacterClass)
242                 delegate.atomPatternCharacter('\b');
243             else {
244                 delegate.assertionWordBoundary(false);
245                 return false;
246             }
247             break;
248         case 'B':
249             consume();
250             if (inCharacterClass)
251                 delegate.atomPatternCharacter('B');
252             else {
253                 delegate.assertionWordBoundary(true);
254                 return false;
255             }
256             break;
257 
258         // CharacterClassEscape
259         case 'd':
260             consume();
261             delegate.atomBuiltInCharacterClass(DigitClassID, false);
262             break;
263         case 's':
264             consume();
265             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
266             break;
267         case 'w':
268             consume();
269             delegate.atomBuiltInCharacterClass(WordClassID, false);
270             break;
271         case 'D':
272             consume();
273             delegate.atomBuiltInCharacterClass(DigitClassID, true);
274             break;
275         case 'S':
276             consume();
277             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
278             break;
279         case 'W':
280             consume();
281             delegate.atomBuiltInCharacterClass(WordClassID, true);
282             break;
283 
284         // DecimalEscape
285         case '1':
286         case '2':
287         case '3':
288         case '4':
289         case '5':
290         case '6':
291         case '7':
292         case '8':
293         case '9': {
294             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
295             // First, try to parse this as backreference.
296             if (!inCharacterClass) {
297                 ParseState state = saveState();
298 
299                 unsigned backReference = consumeNumber();
300                 if (backReference <= m_backReferenceLimit) {
301                     delegate.atomBackReference(backReference);
302                     break;
303                 }
304 
305                 restoreState(state);
306             }
307 
308             // Not a backreference, and not octal.
309             if (peek() >= '8') {
310                 delegate.atomPatternCharacter('\\');
311                 break;
312             }
313 
314             // Fall-through to handle this as an octal escape.
315         }
316 
317         // Octal escape
318         case '0':
319             delegate.atomPatternCharacter(consumeOctal());
320             break;
321 
322         // ControlEscape
323         case 'f':
324             consume();
325             delegate.atomPatternCharacter('\f');
326             break;
327         case 'n':
328             consume();
329             delegate.atomPatternCharacter('\n');
330             break;
331         case 'r':
332             consume();
333             delegate.atomPatternCharacter('\r');
334             break;
335         case 't':
336             consume();
337             delegate.atomPatternCharacter('\t');
338             break;
339         case 'v':
340             consume();
341             delegate.atomPatternCharacter('\v');
342             break;
343 
344         // ControlLetter
345         case 'c': {
346             ParseState state = saveState();
347             consume();
348             if (!atEndOfPattern()) {
349                 int control = consume();
350 
351                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
352                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
353                     delegate.atomPatternCharacter(control & 0x1f);
354                     break;
355                 }
356             }
357             restoreState(state);
358             delegate.atomPatternCharacter('\\');
359             break;
360         }
361 
362         // HexEscape
363         case 'x': {
364             consume();
365             int x = tryConsumeHex(2);
366             if (x == -1)
367                 delegate.atomPatternCharacter('x');
368             else
369                 delegate.atomPatternCharacter(x);
370             break;
371         }
372 
373         // UnicodeEscape
374         case 'u': {
375             consume();
376             int u = tryConsumeHex(4);
377             if (u == -1)
378                 delegate.atomPatternCharacter('u');
379             else
380                 delegate.atomPatternCharacter(u);
381             break;
382         }
383 
384         // IdentityEscape
385         default:
386             delegate.atomPatternCharacter(consume());
387         }
388 
389         return true;
390     }
391 
392     /*
393      * parseAtomEscape(), parseCharacterClassEscape():
394      *
395      * These methods alias to parseEscape().
396      */
parseAtomEscape()397     bool parseAtomEscape()
398     {
399         return parseEscape<false>(m_delegate);
400     }
parseCharacterClassEscape(CharacterClassParserDelegate & delegate)401     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
402     {
403         parseEscape<true>(delegate);
404     }
405 
406     /*
407      * parseCharacterClass():
408      *
409      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
410      * to an instance of CharacterClassParserDelegate, to describe the character class to the
411      * delegate.
412      */
parseCharacterClass()413     void parseCharacterClass()
414     {
415         ASSERT(!m_err);
416         ASSERT(peek() == '[');
417         consume();
418 
419         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
420 
421         characterClassConstructor.begin(tryConsume('^'));
422 
423         while (!atEndOfPattern()) {
424             switch (peek()) {
425             case ']':
426                 consume();
427                 characterClassConstructor.end();
428                 return;
429 
430             case '\\':
431                 parseCharacterClassEscape(characterClassConstructor);
432                 break;
433 
434             default:
435                 characterClassConstructor.atomPatternCharacterUnescaped(consume());
436             }
437 
438             if (m_err)
439                 return;
440         }
441 
442         m_err = CharacterClassUnmatched;
443     }
444 
445     /*
446      * parseParenthesesBegin():
447      *
448      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
449      */
parseParenthesesBegin()450     void parseParenthesesBegin()
451     {
452         ASSERT(!m_err);
453         ASSERT(peek() == '(');
454         consume();
455 
456         if (tryConsume('?')) {
457             if (atEndOfPattern()) {
458                 m_err = ParenthesesTypeInvalid;
459                 return;
460             }
461 
462             switch (consume()) {
463             case ':':
464                 m_delegate.atomParenthesesSubpatternBegin(false);
465                 break;
466 
467             case '=':
468                 m_delegate.atomParentheticalAssertionBegin();
469                 break;
470 
471             case '!':
472                 m_delegate.atomParentheticalAssertionBegin(true);
473                 break;
474 
475             default:
476                 m_err = ParenthesesTypeInvalid;
477             }
478         } else
479             m_delegate.atomParenthesesSubpatternBegin();
480 
481         ++m_parenthesesNestingDepth;
482     }
483 
484     /*
485      * parseParenthesesEnd():
486      *
487      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
488      */
parseParenthesesEnd()489     void parseParenthesesEnd()
490     {
491         ASSERT(!m_err);
492         ASSERT(peek() == ')');
493         consume();
494 
495         if (m_parenthesesNestingDepth > 0)
496             m_delegate.atomParenthesesEnd();
497         else
498             m_err = ParenthesesUnmatched;
499 
500         --m_parenthesesNestingDepth;
501     }
502 
503     /*
504      * parseQuantifier():
505      *
506      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
507      */
parseQuantifier(bool lastTokenWasAnAtom,unsigned min,unsigned max)508     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
509     {
510         ASSERT(!m_err);
511         ASSERT(min <= max);
512 
513         if (lastTokenWasAnAtom)
514             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
515         else
516             m_err = QuantifierWithoutAtom;
517     }
518 
519     /*
520      * parseTokens():
521      *
522      * This method loops over the input pattern reporting tokens to the delegate.
523      * The method returns when a parse error is detected, or the end of the pattern
524      * is reached.  One piece of state is tracked around the loop, which is whether
525      * the last token passed to the delegate was an atom (this is necessary to detect
526      * a parse error when a quantifier provided without an atom to quantify).
527      */
parseTokens()528     void parseTokens()
529     {
530         bool lastTokenWasAnAtom = false;
531 
532         while (!atEndOfPattern()) {
533             switch (peek()) {
534             case '|':
535                 consume();
536                 m_delegate.disjunction();
537                 lastTokenWasAnAtom = false;
538                 break;
539 
540             case '(':
541                 parseParenthesesBegin();
542                 lastTokenWasAnAtom = false;
543                 break;
544 
545             case ')':
546                 parseParenthesesEnd();
547                 lastTokenWasAnAtom = true;
548                 break;
549 
550             case '^':
551                 consume();
552                 m_delegate.assertionBOL();
553                 lastTokenWasAnAtom = false;
554                 break;
555 
556             case '$':
557                 consume();
558                 m_delegate.assertionEOL();
559                 lastTokenWasAnAtom = false;
560                 break;
561 
562             case '.':
563                 consume();
564                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
565                 lastTokenWasAnAtom = true;
566                 break;
567 
568             case '[':
569                 parseCharacterClass();
570                 lastTokenWasAnAtom = true;
571                 break;
572 
573             case '\\':
574                 lastTokenWasAnAtom = parseAtomEscape();
575                 break;
576 
577             case '*':
578                 consume();
579                 parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
580                 lastTokenWasAnAtom = false;
581                 break;
582 
583             case '+':
584                 consume();
585                 parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
586                 lastTokenWasAnAtom = false;
587                 break;
588 
589             case '?':
590                 consume();
591                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
592                 lastTokenWasAnAtom = false;
593                 break;
594 
595             case '{': {
596                 ParseState state = saveState();
597 
598                 consume();
599                 if (peekIsDigit()) {
600                     unsigned min = consumeNumber();
601                     unsigned max = min;
602 
603                     if (tryConsume(','))
604                         max = peekIsDigit() ? consumeNumber() : UINT_MAX;
605 
606                     if (tryConsume('}')) {
607                         if (min <= max)
608                             parseQuantifier(lastTokenWasAnAtom, min, max);
609                         else
610                             m_err = QuantifierOutOfOrder;
611                         lastTokenWasAnAtom = false;
612                         break;
613                     }
614                 }
615 
616                 restoreState(state);
617             } // if we did not find a complete quantifer, fall through to the default case.
618 
619             default:
620                 m_delegate.atomPatternCharacter(consume());
621                 lastTokenWasAnAtom = true;
622             }
623 
624             if (m_err)
625                 return;
626         }
627 
628         if (m_parenthesesNestingDepth > 0)
629             m_err = MissingParentheses;
630     }
631 
632     /*
633      * parse():
634      *
635      * This method calls regexBegin(), calls parseTokens() to parse over the input
636      * patterns, calls regexEnd() or regexError() as appropriate, and converts any
637      * error code to a const char* for a result.
638      */
parse()639     const char* parse()
640     {
641         m_delegate.regexBegin();
642 
643         if (m_size > MAX_PATTERN_SIZE)
644             m_err = PatternTooLarge;
645         else
646             parseTokens();
647         ASSERT(atEndOfPattern() || m_err);
648 
649         if (m_err)
650             m_delegate.regexError();
651         else
652             m_delegate.regexEnd();
653 
654         // The order of this array must match the ErrorCode enum.
655         static const char* errorMessages[NumberOfErrorCodes] = {
656             0, // NoError
657             "regular expression too large",
658             "numbers out of order in {} quantifier",
659             "nothing to repeat",
660             "missing )",
661             "unmatched parentheses",
662             "unrecognized character after (?",
663             "missing terminating ] for character class",
664             "range out of order in character class",
665             "\\ at end of pattern"
666         };
667 
668         return errorMessages[m_err];
669     }
670 
671 
672     // Misc helper functions:
673 
674     typedef unsigned ParseState;
675 
saveState()676     ParseState saveState()
677     {
678         return m_index;
679     }
680 
restoreState(ParseState state)681     void restoreState(ParseState state)
682     {
683         m_index = state;
684     }
685 
atEndOfPattern()686     bool atEndOfPattern()
687     {
688         ASSERT(m_index <= m_size);
689         return m_index == m_size;
690     }
691 
peek()692     int peek()
693     {
694         ASSERT(m_index < m_size);
695         return m_data[m_index];
696     }
697 
peekIsDigit()698     bool peekIsDigit()
699     {
700         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
701     }
702 
peekDigit()703     unsigned peekDigit()
704     {
705         ASSERT(peekIsDigit());
706         return peek() - '0';
707     }
708 
consume()709     int consume()
710     {
711         ASSERT(m_index < m_size);
712         return m_data[m_index++];
713     }
714 
consumeDigit()715     unsigned consumeDigit()
716     {
717         ASSERT(peekIsDigit());
718         return consume() - '0';
719     }
720 
consumeNumber()721     unsigned consumeNumber()
722     {
723         unsigned n = consumeDigit();
724         // check for overflow.
725         for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
726             n = newValue;
727             consume();
728         }
729         return n;
730     }
731 
consumeOctal()732     unsigned consumeOctal()
733     {
734         ASSERT(WTF::isASCIIOctalDigit(peek()));
735 
736         unsigned n = consumeDigit();
737         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
738             n = n * 8 + consumeDigit();
739         return n;
740     }
741 
tryConsume(UChar ch)742     bool tryConsume(UChar ch)
743     {
744         if (atEndOfPattern() || (m_data[m_index] != ch))
745             return false;
746         ++m_index;
747         return true;
748     }
749 
tryConsumeHex(int count)750     int tryConsumeHex(int count)
751     {
752         ParseState state = saveState();
753 
754         int n = 0;
755         while (count--) {
756             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
757                 restoreState(state);
758                 return -1;
759             }
760             n = (n << 4) | WTF::toASCIIHexValue(consume());
761         }
762         return n;
763     }
764 
765     Delegate& m_delegate;
766     unsigned m_backReferenceLimit;
767     ErrorCode m_err;
768     const UChar* m_data;
769     unsigned m_size;
770     unsigned m_index;
771     unsigned m_parenthesesNestingDepth;
772 
773     // Derived by empirical testing of compile time in PCRE and WREC.
774     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
775 };
776 
777 /*
778  * Yarr::parse():
779  *
780  * The parse method is passed a pattern to be parsed and a delegate upon which
781  * callbacks will be made to record the parsed tokens forming the regex.
782  * Yarr::parse() returns null on success, or a const C string providing an error
783  * message where a parse error occurs.
784  *
785  * The Delegate must implement the following interface:
786  *
787  *    void assertionBOL();
788  *    void assertionEOL();
789  *    void assertionWordBoundary(bool invert);
790  *
791  *    void atomPatternCharacter(UChar ch);
792  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
793  *    void atomCharacterClassBegin(bool invert)
794  *    void atomCharacterClassAtom(UChar ch)
795  *    void atomCharacterClassRange(UChar begin, UChar end)
796  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
797  *    void atomCharacterClassEnd()
798  *    void atomParenthesesSubpatternBegin(bool capture = true);
799  *    void atomParentheticalAssertionBegin(bool invert = false);
800  *    void atomParenthesesEnd();
801  *    void atomBackReference(unsigned subpatternId);
802  *
803  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
804  *
805  *    void disjunction();
806  *
807  *    void regexBegin();
808  *    void regexEnd();
809  *    void regexError();
810  *
811  * Before any call recording tokens are made, regexBegin() will be called on the
812  * delegate once.  Once parsing is complete either regexEnd() or regexError() will
813  * be called, as appropriate.
814  *
815  * The regular expression is described by a sequence of assertion*() and atom*()
816  * callbacks to the delegate, describing the terms in the regular expression.
817  * Following an atom a quantifyAtom() call may occur to indicate that the previous
818  * atom should be quantified.  In the case of atoms described across multiple
819  * calls (parentheses and character classes) the call to quantifyAtom() will come
820  * after the call to the atom*End() method, never after atom*Begin().
821  *
822  * Character classes may either be described by a single call to
823  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
824  * In the latter case, ...Begin() will be called, followed by a sequence of
825  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
826  *
827  * Sequences of atoms and assertions are broken into alternatives via calls to
828  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
829  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
830  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
831  * capturing subpattern, this will be the subpatternId associated with these
832  * parentheses, and will also by definition be the lowest subpatternId of these
833  * parentheses and of any nested paretheses.  The atomParenthesesEnd() method
834  * is passed the subpatternId of the last capturing subexpression nested within
835  * these paretheses.  In the case of a capturing subpattern with no nested
836  * capturing subpatterns, the same subpatternId will be passed to the begin and
837  * end functions.  In the case of non-capturing subpatterns the subpatternId
838  * passed to the begin method is also the first possible subpatternId that might
839  * be nested within these paretheses.  If a set of non-capturing parentheses does
840  * not contain any capturing subpatterns, then the subpatternId passed to begin
841  * will be greater than the subpatternId passed to end.
842  */
843 
844 template<class Delegate>
845 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
846 {
847     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
848 }
849 
850 } } // namespace JSC::Yarr
851 
852 #endif
853 
854 #endif // RegexParser_h
855