/*
 * Copyright (C) 2009 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef RegexParser_h
#define RegexParser_h

#include <wtf/Platform.h>

#if ENABLE(YARR)

#include <UString.h>
#include <wtf/ASCIICType.h>
#include <wtf/unicode/Unicode.h>
#include <limits.h>

namespace JSC { namespace Yarr {

// Identifies the predefined character classes that the parser reports to its
// delegate. Each is reported together with an 'invert' flag, so a single ID
// covers both the class and its complement.
enum BuiltInCharacterClassID {
    DigitClassID,   // reported for \d (and, inverted, for \D)
    SpaceClassID,   // reported for \s (and, inverted, for \S)
    WordClassID,    // reported for \w (and, inverted, for \W)
    NewlineClassID, // reported inverted for '.'
};

// The Parser class should not be used directly - only via the Yarr::parse() method.
48 template<class Delegate> 49 class Parser { 50 private: 51 template<class FriendDelegate> 52 friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); 53 54 enum ErrorCode { 55 NoError, 56 PatternTooLarge, 57 QuantifierOutOfOrder, 58 QuantifierWithoutAtom, 59 MissingParentheses, 60 ParenthesesUnmatched, 61 ParenthesesTypeInvalid, 62 CharacterClassUnmatched, 63 CharacterClassOutOfOrder, 64 EscapeUnterminated, 65 NumberOfErrorCodes 66 }; 67 68 /* 69 * CharacterClassParserDelegate: 70 * 71 * The class CharacterClassParserDelegate is used in the parsing of character 72 * classes. This class handles detection of character ranges. This class 73 * implements enough of the delegate interface such that it can be passed to 74 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused 75 * to perform the parsing of escape characters in character sets. 76 */ 77 class CharacterClassParserDelegate { 78 public: CharacterClassParserDelegate(Delegate & delegate,ErrorCode & err)79 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) 80 : m_delegate(delegate) 81 , m_err(err) 82 , m_state(empty) 83 { 84 } 85 86 /* 87 * begin(): 88 * 89 * Called at beginning of construction. 90 */ begin(bool invert)91 void begin(bool invert) 92 { 93 m_delegate.atomCharacterClassBegin(invert); 94 } 95 96 /* 97 * atomPatternCharacterUnescaped(): 98 * 99 * This method is called directly from parseCharacterClass(), to report a new 100 * pattern character token. This method differs from atomPatternCharacter(), 101 * which will be called from parseEscape(), since a hypen provided via this 102 * method may be indicating a character range, but a hyphen parsed by 103 * parseEscape() cannot be interpreted as doing so. 
104 */ atomPatternCharacterUnescaped(UChar ch)105 void atomPatternCharacterUnescaped(UChar ch) 106 { 107 switch (m_state) { 108 case empty: 109 m_character = ch; 110 m_state = cachedCharacter; 111 break; 112 113 case cachedCharacter: 114 if (ch == '-') 115 m_state = cachedCharacterHyphen; 116 else { 117 m_delegate.atomCharacterClassAtom(m_character); 118 m_character = ch; 119 } 120 break; 121 122 case cachedCharacterHyphen: 123 if (ch >= m_character) 124 m_delegate.atomCharacterClassRange(m_character, ch); 125 else 126 m_err = CharacterClassOutOfOrder; 127 m_state = empty; 128 } 129 } 130 131 /* 132 * atomPatternCharacter(): 133 * 134 * Adds a pattern character, called by parseEscape(), as such will not 135 * interpret a hyphen as indicating a character range. 136 */ atomPatternCharacter(UChar ch)137 void atomPatternCharacter(UChar ch) 138 { 139 // Flush if a character is already pending to prevent the 140 // hyphen from begin interpreted as indicating a range. 141 if((ch == '-') && (m_state == cachedCharacter)) 142 flush(); 143 144 atomPatternCharacterUnescaped(ch); 145 } 146 147 /* 148 * atomBuiltInCharacterClass(): 149 * 150 * Adds a built-in character class, called by parseEscape(). 151 */ atomBuiltInCharacterClass(BuiltInCharacterClassID classID,bool invert)152 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) 153 { 154 flush(); 155 m_delegate.atomCharacterClassBuiltIn(classID, invert); 156 } 157 158 /* 159 * end(): 160 * 161 * Called at end of construction. 162 */ end()163 void end() 164 { 165 flush(); 166 m_delegate.atomCharacterClassEnd(); 167 } 168 169 // parseEscape() should never call these delegate methods when 170 // invoked with inCharacterClass set. 
assertionWordBoundary(bool)171 void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } atomBackReference(unsigned)172 void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } 173 174 private: flush()175 void flush() 176 { 177 if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen 178 m_delegate.atomCharacterClassAtom(m_character); 179 if (m_state == cachedCharacterHyphen) 180 m_delegate.atomCharacterClassAtom('-'); 181 m_state = empty; 182 } 183 184 Delegate& m_delegate; 185 ErrorCode& m_err; 186 enum CharacterClassConstructionState { 187 empty, 188 cachedCharacter, 189 cachedCharacterHyphen, 190 } m_state; 191 UChar m_character; 192 }; 193 Parser(Delegate & delegate,const UString & pattern,unsigned backReferenceLimit)194 Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) 195 : m_delegate(delegate) 196 , m_backReferenceLimit(backReferenceLimit) 197 , m_err(NoError) 198 , m_data(pattern.data()) 199 , m_size(pattern.size()) 200 , m_index(0) 201 , m_parenthesesNestingDepth(0) 202 { 203 } 204 205 /* 206 * parseEscape(): 207 * 208 * Helper for parseTokens() AND parseCharacterClass(). 209 * Unlike the other parser methods, this function does not report tokens 210 * directly to the member delegate (m_delegate), instead tokens are 211 * emitted to the delegate provided as an argument. In the case of atom 212 * escapes, parseTokens() will call parseEscape() passing m_delegate as 213 * an argument, and as such the escape will be reported to the delegate. 214 * 215 * However this method may also be used by parseCharacterClass(), in which 216 * case a CharacterClassParserDelegate will be passed as the delegate that 217 * tokens should be added to. A boolean flag is also provided to indicate 218 * whether that an escape in a CharacterClass is being parsed (some parsing 219 * rules change in this context). 
220 * 221 * The boolean value returned by this method indicates whether the token 222 * parsed was an atom (outside of a characted class \b and \B will be 223 * interpreted as assertions). 224 */ 225 template<bool inCharacterClass, class EscapeDelegate> parseEscape(EscapeDelegate & delegate)226 bool parseEscape(EscapeDelegate& delegate) 227 { 228 ASSERT(!m_err); 229 ASSERT(peek() == '\\'); 230 consume(); 231 232 if (atEndOfPattern()) { 233 m_err = EscapeUnterminated; 234 return false; 235 } 236 237 switch (peek()) { 238 // Assertions 239 case 'b': 240 consume(); 241 if (inCharacterClass) 242 delegate.atomPatternCharacter('\b'); 243 else { 244 delegate.assertionWordBoundary(false); 245 return false; 246 } 247 break; 248 case 'B': 249 consume(); 250 if (inCharacterClass) 251 delegate.atomPatternCharacter('B'); 252 else { 253 delegate.assertionWordBoundary(true); 254 return false; 255 } 256 break; 257 258 // CharacterClassEscape 259 case 'd': 260 consume(); 261 delegate.atomBuiltInCharacterClass(DigitClassID, false); 262 break; 263 case 's': 264 consume(); 265 delegate.atomBuiltInCharacterClass(SpaceClassID, false); 266 break; 267 case 'w': 268 consume(); 269 delegate.atomBuiltInCharacterClass(WordClassID, false); 270 break; 271 case 'D': 272 consume(); 273 delegate.atomBuiltInCharacterClass(DigitClassID, true); 274 break; 275 case 'S': 276 consume(); 277 delegate.atomBuiltInCharacterClass(SpaceClassID, true); 278 break; 279 case 'W': 280 consume(); 281 delegate.atomBuiltInCharacterClass(WordClassID, true); 282 break; 283 284 // DecimalEscape 285 case '1': 286 case '2': 287 case '3': 288 case '4': 289 case '5': 290 case '6': 291 case '7': 292 case '8': 293 case '9': { 294 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. 295 // First, try to parse this as backreference. 
296 if (!inCharacterClass) { 297 ParseState state = saveState(); 298 299 unsigned backReference = consumeNumber(); 300 if (backReference <= m_backReferenceLimit) { 301 delegate.atomBackReference(backReference); 302 break; 303 } 304 305 restoreState(state); 306 } 307 308 // Not a backreference, and not octal. 309 if (peek() >= '8') { 310 delegate.atomPatternCharacter('\\'); 311 break; 312 } 313 314 // Fall-through to handle this as an octal escape. 315 } 316 317 // Octal escape 318 case '0': 319 delegate.atomPatternCharacter(consumeOctal()); 320 break; 321 322 // ControlEscape 323 case 'f': 324 consume(); 325 delegate.atomPatternCharacter('\f'); 326 break; 327 case 'n': 328 consume(); 329 delegate.atomPatternCharacter('\n'); 330 break; 331 case 'r': 332 consume(); 333 delegate.atomPatternCharacter('\r'); 334 break; 335 case 't': 336 consume(); 337 delegate.atomPatternCharacter('\t'); 338 break; 339 case 'v': 340 consume(); 341 delegate.atomPatternCharacter('\v'); 342 break; 343 344 // ControlLetter 345 case 'c': { 346 ParseState state = saveState(); 347 consume(); 348 if (!atEndOfPattern()) { 349 int control = consume(); 350 351 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. 352 if (inCharacterClass ? 
WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { 353 delegate.atomPatternCharacter(control & 0x1f); 354 break; 355 } 356 } 357 restoreState(state); 358 delegate.atomPatternCharacter('\\'); 359 break; 360 } 361 362 // HexEscape 363 case 'x': { 364 consume(); 365 int x = tryConsumeHex(2); 366 if (x == -1) 367 delegate.atomPatternCharacter('x'); 368 else 369 delegate.atomPatternCharacter(x); 370 break; 371 } 372 373 // UnicodeEscape 374 case 'u': { 375 consume(); 376 int u = tryConsumeHex(4); 377 if (u == -1) 378 delegate.atomPatternCharacter('u'); 379 else 380 delegate.atomPatternCharacter(u); 381 break; 382 } 383 384 // IdentityEscape 385 default: 386 delegate.atomPatternCharacter(consume()); 387 } 388 389 return true; 390 } 391 392 /* 393 * parseAtomEscape(), parseCharacterClassEscape(): 394 * 395 * These methods alias to parseEscape(). 396 */ parseAtomEscape()397 bool parseAtomEscape() 398 { 399 return parseEscape<false>(m_delegate); 400 } parseCharacterClassEscape(CharacterClassParserDelegate & delegate)401 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) 402 { 403 parseEscape<true>(delegate); 404 } 405 406 /* 407 * parseCharacterClass(): 408 * 409 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) 410 * to an instance of CharacterClassParserDelegate, to describe the character class to the 411 * delegate. 
412 */ parseCharacterClass()413 void parseCharacterClass() 414 { 415 ASSERT(!m_err); 416 ASSERT(peek() == '['); 417 consume(); 418 419 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); 420 421 characterClassConstructor.begin(tryConsume('^')); 422 423 while (!atEndOfPattern()) { 424 switch (peek()) { 425 case ']': 426 consume(); 427 characterClassConstructor.end(); 428 return; 429 430 case '\\': 431 parseCharacterClassEscape(characterClassConstructor); 432 break; 433 434 default: 435 characterClassConstructor.atomPatternCharacterUnescaped(consume()); 436 } 437 438 if (m_err) 439 return; 440 } 441 442 m_err = CharacterClassUnmatched; 443 } 444 445 /* 446 * parseParenthesesBegin(): 447 * 448 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. 449 */ parseParenthesesBegin()450 void parseParenthesesBegin() 451 { 452 ASSERT(!m_err); 453 ASSERT(peek() == '('); 454 consume(); 455 456 if (tryConsume('?')) { 457 if (atEndOfPattern()) { 458 m_err = ParenthesesTypeInvalid; 459 return; 460 } 461 462 switch (consume()) { 463 case ':': 464 m_delegate.atomParenthesesSubpatternBegin(false); 465 break; 466 467 case '=': 468 m_delegate.atomParentheticalAssertionBegin(); 469 break; 470 471 case '!': 472 m_delegate.atomParentheticalAssertionBegin(true); 473 break; 474 475 default: 476 m_err = ParenthesesTypeInvalid; 477 } 478 } else 479 m_delegate.atomParenthesesSubpatternBegin(); 480 481 ++m_parenthesesNestingDepth; 482 } 483 484 /* 485 * parseParenthesesEnd(): 486 * 487 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). 
488 */ parseParenthesesEnd()489 void parseParenthesesEnd() 490 { 491 ASSERT(!m_err); 492 ASSERT(peek() == ')'); 493 consume(); 494 495 if (m_parenthesesNestingDepth > 0) 496 m_delegate.atomParenthesesEnd(); 497 else 498 m_err = ParenthesesUnmatched; 499 500 --m_parenthesesNestingDepth; 501 } 502 503 /* 504 * parseQuantifier(): 505 * 506 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. 507 */ parseQuantifier(bool lastTokenWasAnAtom,unsigned min,unsigned max)508 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) 509 { 510 ASSERT(!m_err); 511 ASSERT(min <= max); 512 513 if (lastTokenWasAnAtom) 514 m_delegate.quantifyAtom(min, max, !tryConsume('?')); 515 else 516 m_err = QuantifierWithoutAtom; 517 } 518 519 /* 520 * parseTokens(): 521 * 522 * This method loops over the input pattern reporting tokens to the delegate. 523 * The method returns when a parse error is detected, or the end of the pattern 524 * is reached. One piece of state is tracked around the loop, which is whether 525 * the last token passed to the delegate was an atom (this is necessary to detect 526 * a parse error when a quantifier provided without an atom to quantify). 
527 */ parseTokens()528 void parseTokens() 529 { 530 bool lastTokenWasAnAtom = false; 531 532 while (!atEndOfPattern()) { 533 switch (peek()) { 534 case '|': 535 consume(); 536 m_delegate.disjunction(); 537 lastTokenWasAnAtom = false; 538 break; 539 540 case '(': 541 parseParenthesesBegin(); 542 lastTokenWasAnAtom = false; 543 break; 544 545 case ')': 546 parseParenthesesEnd(); 547 lastTokenWasAnAtom = true; 548 break; 549 550 case '^': 551 consume(); 552 m_delegate.assertionBOL(); 553 lastTokenWasAnAtom = false; 554 break; 555 556 case '$': 557 consume(); 558 m_delegate.assertionEOL(); 559 lastTokenWasAnAtom = false; 560 break; 561 562 case '.': 563 consume(); 564 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); 565 lastTokenWasAnAtom = true; 566 break; 567 568 case '[': 569 parseCharacterClass(); 570 lastTokenWasAnAtom = true; 571 break; 572 573 case '\\': 574 lastTokenWasAnAtom = parseAtomEscape(); 575 break; 576 577 case '*': 578 consume(); 579 parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX); 580 lastTokenWasAnAtom = false; 581 break; 582 583 case '+': 584 consume(); 585 parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX); 586 lastTokenWasAnAtom = false; 587 break; 588 589 case '?': 590 consume(); 591 parseQuantifier(lastTokenWasAnAtom, 0, 1); 592 lastTokenWasAnAtom = false; 593 break; 594 595 case '{': { 596 ParseState state = saveState(); 597 598 consume(); 599 if (peekIsDigit()) { 600 unsigned min = consumeNumber(); 601 unsigned max = min; 602 603 if (tryConsume(',')) 604 max = peekIsDigit() ? consumeNumber() : UINT_MAX; 605 606 if (tryConsume('}')) { 607 if (min <= max) 608 parseQuantifier(lastTokenWasAnAtom, min, max); 609 else 610 m_err = QuantifierOutOfOrder; 611 lastTokenWasAnAtom = false; 612 break; 613 } 614 } 615 616 restoreState(state); 617 } // if we did not find a complete quantifer, fall through to the default case. 
618 619 default: 620 m_delegate.atomPatternCharacter(consume()); 621 lastTokenWasAnAtom = true; 622 } 623 624 if (m_err) 625 return; 626 } 627 628 if (m_parenthesesNestingDepth > 0) 629 m_err = MissingParentheses; 630 } 631 632 /* 633 * parse(): 634 * 635 * This method calls regexBegin(), calls parseTokens() to parse over the input 636 * patterns, calls regexEnd() or regexError() as appropriate, and converts any 637 * error code to a const char* for a result. 638 */ parse()639 const char* parse() 640 { 641 m_delegate.regexBegin(); 642 643 if (m_size > MAX_PATTERN_SIZE) 644 m_err = PatternTooLarge; 645 else 646 parseTokens(); 647 ASSERT(atEndOfPattern() || m_err); 648 649 if (m_err) 650 m_delegate.regexError(); 651 else 652 m_delegate.regexEnd(); 653 654 // The order of this array must match the ErrorCode enum. 655 static const char* errorMessages[NumberOfErrorCodes] = { 656 0, // NoError 657 "regular expression too large", 658 "numbers out of order in {} quantifier", 659 "nothing to repeat", 660 "missing )", 661 "unmatched parentheses", 662 "unrecognized character after (?", 663 "missing terminating ] for character class", 664 "range out of order in character class", 665 "\\ at end of pattern" 666 }; 667 668 return errorMessages[m_err]; 669 } 670 671 672 // Misc helper functions: 673 674 typedef unsigned ParseState; 675 saveState()676 ParseState saveState() 677 { 678 return m_index; 679 } 680 restoreState(ParseState state)681 void restoreState(ParseState state) 682 { 683 m_index = state; 684 } 685 atEndOfPattern()686 bool atEndOfPattern() 687 { 688 ASSERT(m_index <= m_size); 689 return m_index == m_size; 690 } 691 peek()692 int peek() 693 { 694 ASSERT(m_index < m_size); 695 return m_data[m_index]; 696 } 697 peekIsDigit()698 bool peekIsDigit() 699 { 700 return !atEndOfPattern() && WTF::isASCIIDigit(peek()); 701 } 702 peekDigit()703 unsigned peekDigit() 704 { 705 ASSERT(peekIsDigit()); 706 return peek() - '0'; 707 } 708 consume()709 int consume() 710 { 711 
ASSERT(m_index < m_size); 712 return m_data[m_index++]; 713 } 714 consumeDigit()715 unsigned consumeDigit() 716 { 717 ASSERT(peekIsDigit()); 718 return consume() - '0'; 719 } 720 consumeNumber()721 unsigned consumeNumber() 722 { 723 unsigned n = consumeDigit(); 724 // check for overflow. 725 for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { 726 n = newValue; 727 consume(); 728 } 729 return n; 730 } 731 consumeOctal()732 unsigned consumeOctal() 733 { 734 ASSERT(WTF::isASCIIOctalDigit(peek())); 735 736 unsigned n = consumeDigit(); 737 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) 738 n = n * 8 + consumeDigit(); 739 return n; 740 } 741 tryConsume(UChar ch)742 bool tryConsume(UChar ch) 743 { 744 if (atEndOfPattern() || (m_data[m_index] != ch)) 745 return false; 746 ++m_index; 747 return true; 748 } 749 tryConsumeHex(int count)750 int tryConsumeHex(int count) 751 { 752 ParseState state = saveState(); 753 754 int n = 0; 755 while (count--) { 756 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { 757 restoreState(state); 758 return -1; 759 } 760 n = (n << 4) | WTF::toASCIIHexValue(consume()); 761 } 762 return n; 763 } 764 765 Delegate& m_delegate; 766 unsigned m_backReferenceLimit; 767 ErrorCode m_err; 768 const UChar* m_data; 769 unsigned m_size; 770 unsigned m_index; 771 unsigned m_parenthesesNestingDepth; 772 773 // Derived by empirical testing of compile time in PCRE and WREC. 774 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; 775 }; 776 777 /* 778 * Yarr::parse(): 779 * 780 * The parse method is passed a pattern to be parsed and a delegate upon which 781 * callbacks will be made to record the parsed tokens forming the regex. 782 * Yarr::parse() returns null on success, or a const C string providing an error 783 * message where a parse error occurs. 
784 * 785 * The Delegate must implement the following interface: 786 * 787 * void assertionBOL(); 788 * void assertionEOL(); 789 * void assertionWordBoundary(bool invert); 790 * 791 * void atomPatternCharacter(UChar ch); 792 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); 793 * void atomCharacterClassBegin(bool invert) 794 * void atomCharacterClassAtom(UChar ch) 795 * void atomCharacterClassRange(UChar begin, UChar end) 796 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) 797 * void atomCharacterClassEnd() 798 * void atomParenthesesSubpatternBegin(bool capture = true); 799 * void atomParentheticalAssertionBegin(bool invert = false); 800 * void atomParenthesesEnd(); 801 * void atomBackReference(unsigned subpatternId); 802 * 803 * void quantifyAtom(unsigned min, unsigned max, bool greedy); 804 * 805 * void disjunction(); 806 * 807 * void regexBegin(); 808 * void regexEnd(); 809 * void regexError(); 810 * 811 * Before any call recording tokens are made, regexBegin() will be called on the 812 * delegate once. Once parsing is complete either regexEnd() or regexError() will 813 * be called, as appropriate. 814 * 815 * The regular expression is described by a sequence of assertion*() and atom*() 816 * callbacks to the delegate, describing the terms in the regular expression. 817 * Following an atom a quantifyAtom() call may occur to indicate that the previous 818 * atom should be quantified. In the case of atoms described across multiple 819 * calls (parentheses and character classes) the call to quantifyAtom() will come 820 * after the call to the atom*End() method, never after atom*Begin(). 821 * 822 * Character classes may either be described by a single call to 823 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. 824 * In the latter case, ...Begin() will be called, followed by a sequence of 825 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). 
826 * 827 * Sequences of atoms and assertions are broken into alternatives via calls to 828 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to 829 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. 830 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular 831 * capturing subpattern, this will be the subpatternId associated with these 832 * parentheses, and will also by definition be the lowest subpatternId of these 833 * parentheses and of any nested paretheses. The atomParenthesesEnd() method 834 * is passed the subpatternId of the last capturing subexpression nested within 835 * these paretheses. In the case of a capturing subpattern with no nested 836 * capturing subpatterns, the same subpatternId will be passed to the begin and 837 * end functions. In the case of non-capturing subpatterns the subpatternId 838 * passed to the begin method is also the first possible subpatternId that might 839 * be nested within these paretheses. If a set of non-capturing parentheses does 840 * not contain any capturing subpatterns, then the subpatternId passed to begin 841 * will be greater than the subpatternId passed to end. 842 */ 843 844 template<class Delegate> 845 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX) 846 { 847 return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); 848 } 849 850 } } // namespace JSC::Yarr 851 852 #endif 853 854 #endif // RegexParser_h 855