1 /* 2 * Copyright (C) 2009 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #ifndef YarrParser_h 27 #define YarrParser_h 28 29 #include <runtime/UString.h> 30 #include "Yarr.h" 31 #include <wtf/ASCIICType.h> 32 #include <wtf/unicode/Unicode.h> 33 34 namespace JSC { namespace Yarr { 35 36 #define REGEXP_ERROR_PREFIX "Invalid regular expression: " 37 38 enum BuiltInCharacterClassID { 39 DigitClassID, 40 SpaceClassID, 41 WordClassID, 42 NewlineClassID, 43 }; 44 45 // The Parser class should not be used directly - only via the Yarr::parse() method. 46 template<class Delegate> 47 class Parser { 48 private: 49 template<class FriendDelegate> 50 friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); 51 52 enum ErrorCode { 53 NoError, 54 PatternTooLarge, 55 QuantifierOutOfOrder, 56 QuantifierWithoutAtom, 57 MissingParentheses, 58 ParenthesesUnmatched, 59 ParenthesesTypeInvalid, 60 CharacterClassUnmatched, 61 CharacterClassOutOfOrder, 62 EscapeUnterminated, 63 NumberOfErrorCodes 64 }; 65 66 /* 67 * CharacterClassParserDelegate: 68 * 69 * The class CharacterClassParserDelegate is used in the parsing of character 70 * classes. This class handles detection of character ranges. This class 71 * implements enough of the delegate interface such that it can be passed to 72 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused 73 * to perform the parsing of escape characters in character sets. 74 */ 75 class CharacterClassParserDelegate { 76 public: CharacterClassParserDelegate(Delegate & delegate,ErrorCode & err)77 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) 78 : m_delegate(delegate) 79 , m_err(err) 80 , m_state(Empty) 81 , m_character(0) 82 { 83 } 84 85 /* 86 * begin(): 87 * 88 * Called at beginning of construction. 89 */ begin(bool invert)90 void begin(bool invert) 91 { 92 m_delegate.atomCharacterClassBegin(invert); 93 } 94 95 /* 96 * atomPatternCharacter(): 97 * 98 * This method is called either from parseCharacterClass() (for an unescaped 99 * character in a character class), or from parseEscape(). In the former case 100 * the value true will be passed for the argument 'hyphenIsRange', and in this 101 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ 102 * is different to /[a\-z]/). 103 */ 104 void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) 105 { 106 switch (m_state) { 107 case AfterCharacterClass: 108 // Following a builtin character class we need look out for a hyphen. 109 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. 110 // If we see a hyphen following a charater class then unlike usual 111 // we'll report it to the delegate immediately, and put ourself into 112 // a poisoned state. Any following calls to add another character or 113 // character class will result in an error. (A hypen following a 114 // character-class is itself valid, but only at the end of a regex). 115 if (hyphenIsRange && ch == '-') { 116 m_delegate.atomCharacterClassAtom('-'); 117 m_state = AfterCharacterClassHyphen; 118 return; 119 } 120 // Otherwise just fall through - cached character so treat this as Empty. 121 122 case Empty: 123 m_character = ch; 124 m_state = CachedCharacter; 125 return; 126 127 case CachedCharacter: 128 if (hyphenIsRange && ch == '-') 129 m_state = CachedCharacterHyphen; 130 else { 131 m_delegate.atomCharacterClassAtom(m_character); 132 m_character = ch; 133 } 134 return; 135 136 case CachedCharacterHyphen: 137 if (ch < m_character) { 138 m_err = CharacterClassOutOfOrder; 139 return; 140 } 141 m_delegate.atomCharacterClassRange(m_character, ch); 142 m_state = Empty; 143 return; 144 145 // See coment in atomBuiltInCharacterClass below. 146 // This too is technically an error, per ECMA-262, and again we 147 // we chose to allow this. Note a subtlely here that while we 148 // diverge from the spec's definition of CharacterRange we do 149 // remain in compliance with the grammar. For example, consider 150 // the expression /[\d-a-z]/. We comply with the grammar in 151 // this case by not allowing a-z to be matched as a range. 152 case AfterCharacterClassHyphen: 153 m_delegate.atomCharacterClassAtom(ch); 154 m_state = Empty; 155 return; 156 } 157 } 158 159 /* 160 * atomBuiltInCharacterClass(): 161 * 162 * Adds a built-in character class, called by parseEscape(). 163 */ atomBuiltInCharacterClass(BuiltInCharacterClassID classID,bool invert)164 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) 165 { 166 switch (m_state) { 167 case CachedCharacter: 168 // Flush the currently cached character, then fall through. 169 m_delegate.atomCharacterClassAtom(m_character); 170 171 case Empty: 172 case AfterCharacterClass: 173 m_state = AfterCharacterClass; 174 m_delegate.atomCharacterClassBuiltIn(classID, invert); 175 return; 176 177 // If we hit either of these cases, we have an invalid range that 178 // looks something like /[x-\d]/ or /[\d-\d]/. 179 // According to ECMA-262 this should be a syntax error, but 180 // empirical testing shows this to break teh webz. Instead we 181 // comply with to the ECMA-262 grammar, and assume the grammar to 182 // have matched the range correctly, but tweak our interpretation 183 // of CharacterRange. Effectively we implicitly handle the hyphen 184 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. 185 case CachedCharacterHyphen: 186 m_delegate.atomCharacterClassAtom(m_character); 187 m_delegate.atomCharacterClassAtom('-'); 188 // fall through 189 case AfterCharacterClassHyphen: 190 m_delegate.atomCharacterClassBuiltIn(classID, invert); 191 m_state = Empty; 192 return; 193 } 194 } 195 196 /* 197 * end(): 198 * 199 * Called at end of construction. 200 */ end()201 void end() 202 { 203 if (m_state == CachedCharacter) 204 m_delegate.atomCharacterClassAtom(m_character); 205 else if (m_state == CachedCharacterHyphen) { 206 m_delegate.atomCharacterClassAtom(m_character); 207 m_delegate.atomCharacterClassAtom('-'); 208 } 209 m_delegate.atomCharacterClassEnd(); 210 } 211 212 // parseEscape() should never call these delegate methods when 213 // invoked with inCharacterClass set. assertionWordBoundary(bool)214 void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } atomBackReference(unsigned)215 void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } 216 217 private: 218 Delegate& m_delegate; 219 ErrorCode& m_err; 220 enum CharacterClassConstructionState { 221 Empty, 222 CachedCharacter, 223 CachedCharacterHyphen, 224 AfterCharacterClass, 225 AfterCharacterClassHyphen, 226 } m_state; 227 UChar m_character; 228 }; 229 Parser(Delegate & delegate,const UString & pattern,unsigned backReferenceLimit)230 Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) 231 : m_delegate(delegate) 232 , m_backReferenceLimit(backReferenceLimit) 233 , m_err(NoError) 234 , m_data(pattern.characters()) 235 , m_size(pattern.length()) 236 , m_index(0) 237 , m_parenthesesNestingDepth(0) 238 { 239 } 240 241 /* 242 * parseEscape(): 243 * 244 * Helper for parseTokens() AND parseCharacterClass(). 245 * Unlike the other parser methods, this function does not report tokens 246 * directly to the member delegate (m_delegate), instead tokens are 247 * emitted to the delegate provided as an argument. In the case of atom 248 * escapes, parseTokens() will call parseEscape() passing m_delegate as 249 * an argument, and as such the escape will be reported to the delegate. 250 * 251 * However this method may also be used by parseCharacterClass(), in which 252 * case a CharacterClassParserDelegate will be passed as the delegate that 253 * tokens should be added to. A boolean flag is also provided to indicate 254 * whether that an escape in a CharacterClass is being parsed (some parsing 255 * rules change in this context). 256 * 257 * The boolean value returned by this method indicates whether the token 258 * parsed was an atom (outside of a characted class \b and \B will be 259 * interpreted as assertions). 260 */ 261 template<bool inCharacterClass, class EscapeDelegate> parseEscape(EscapeDelegate & delegate)262 bool parseEscape(EscapeDelegate& delegate) 263 { 264 ASSERT(!m_err); 265 ASSERT(peek() == '\\'); 266 consume(); 267 268 if (atEndOfPattern()) { 269 m_err = EscapeUnterminated; 270 return false; 271 } 272 273 switch (peek()) { 274 // Assertions 275 case 'b': 276 consume(); 277 if (inCharacterClass) 278 delegate.atomPatternCharacter('\b'); 279 else { 280 delegate.assertionWordBoundary(false); 281 return false; 282 } 283 break; 284 case 'B': 285 consume(); 286 if (inCharacterClass) 287 delegate.atomPatternCharacter('B'); 288 else { 289 delegate.assertionWordBoundary(true); 290 return false; 291 } 292 break; 293 294 // CharacterClassEscape 295 case 'd': 296 consume(); 297 delegate.atomBuiltInCharacterClass(DigitClassID, false); 298 break; 299 case 's': 300 consume(); 301 delegate.atomBuiltInCharacterClass(SpaceClassID, false); 302 break; 303 case 'w': 304 consume(); 305 delegate.atomBuiltInCharacterClass(WordClassID, false); 306 break; 307 case 'D': 308 consume(); 309 delegate.atomBuiltInCharacterClass(DigitClassID, true); 310 break; 311 case 'S': 312 consume(); 313 delegate.atomBuiltInCharacterClass(SpaceClassID, true); 314 break; 315 case 'W': 316 consume(); 317 delegate.atomBuiltInCharacterClass(WordClassID, true); 318 break; 319 320 // DecimalEscape 321 case '1': 322 case '2': 323 case '3': 324 case '4': 325 case '5': 326 case '6': 327 case '7': 328 case '8': 329 case '9': { 330 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. 331 // First, try to parse this as backreference. 332 if (!inCharacterClass) { 333 ParseState state = saveState(); 334 335 unsigned backReference = consumeNumber(); 336 if (backReference <= m_backReferenceLimit) { 337 delegate.atomBackReference(backReference); 338 break; 339 } 340 341 restoreState(state); 342 } 343 344 // Not a backreference, and not octal. 345 if (peek() >= '8') { 346 delegate.atomPatternCharacter('\\'); 347 break; 348 } 349 350 // Fall-through to handle this as an octal escape. 351 } 352 353 // Octal escape 354 case '0': 355 delegate.atomPatternCharacter(consumeOctal()); 356 break; 357 358 // ControlEscape 359 case 'f': 360 consume(); 361 delegate.atomPatternCharacter('\f'); 362 break; 363 case 'n': 364 consume(); 365 delegate.atomPatternCharacter('\n'); 366 break; 367 case 'r': 368 consume(); 369 delegate.atomPatternCharacter('\r'); 370 break; 371 case 't': 372 consume(); 373 delegate.atomPatternCharacter('\t'); 374 break; 375 case 'v': 376 consume(); 377 delegate.atomPatternCharacter('\v'); 378 break; 379 380 // ControlLetter 381 case 'c': { 382 ParseState state = saveState(); 383 consume(); 384 if (!atEndOfPattern()) { 385 int control = consume(); 386 387 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. 388 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { 389 delegate.atomPatternCharacter(control & 0x1f); 390 break; 391 } 392 } 393 restoreState(state); 394 delegate.atomPatternCharacter('\\'); 395 break; 396 } 397 398 // HexEscape 399 case 'x': { 400 consume(); 401 int x = tryConsumeHex(2); 402 if (x == -1) 403 delegate.atomPatternCharacter('x'); 404 else 405 delegate.atomPatternCharacter(x); 406 break; 407 } 408 409 // UnicodeEscape 410 case 'u': { 411 consume(); 412 int u = tryConsumeHex(4); 413 if (u == -1) 414 delegate.atomPatternCharacter('u'); 415 else 416 delegate.atomPatternCharacter(u); 417 break; 418 } 419 420 // IdentityEscape 421 default: 422 delegate.atomPatternCharacter(consume()); 423 } 424 425 return true; 426 } 427 428 /* 429 * parseAtomEscape(), parseCharacterClassEscape(): 430 * 431 * These methods alias to parseEscape(). 432 */ parseAtomEscape()433 bool parseAtomEscape() 434 { 435 return parseEscape<false>(m_delegate); 436 } parseCharacterClassEscape(CharacterClassParserDelegate & delegate)437 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) 438 { 439 parseEscape<true>(delegate); 440 } 441 442 /* 443 * parseCharacterClass(): 444 * 445 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) 446 * to an instance of CharacterClassParserDelegate, to describe the character class to the 447 * delegate. 448 */ parseCharacterClass()449 void parseCharacterClass() 450 { 451 ASSERT(!m_err); 452 ASSERT(peek() == '['); 453 consume(); 454 455 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); 456 457 characterClassConstructor.begin(tryConsume('^')); 458 459 while (!atEndOfPattern()) { 460 switch (peek()) { 461 case ']': 462 consume(); 463 characterClassConstructor.end(); 464 return; 465 466 case '\\': 467 parseCharacterClassEscape(characterClassConstructor); 468 break; 469 470 default: 471 characterClassConstructor.atomPatternCharacter(consume(), true); 472 } 473 474 if (m_err) 475 return; 476 } 477 478 m_err = CharacterClassUnmatched; 479 } 480 481 /* 482 * parseParenthesesBegin(): 483 * 484 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. 485 */ parseParenthesesBegin()486 void parseParenthesesBegin() 487 { 488 ASSERT(!m_err); 489 ASSERT(peek() == '('); 490 consume(); 491 492 if (tryConsume('?')) { 493 if (atEndOfPattern()) { 494 m_err = ParenthesesTypeInvalid; 495 return; 496 } 497 498 switch (consume()) { 499 case ':': 500 m_delegate.atomParenthesesSubpatternBegin(false); 501 break; 502 503 case '=': 504 m_delegate.atomParentheticalAssertionBegin(); 505 break; 506 507 case '!': 508 m_delegate.atomParentheticalAssertionBegin(true); 509 break; 510 511 default: 512 m_err = ParenthesesTypeInvalid; 513 } 514 } else 515 m_delegate.atomParenthesesSubpatternBegin(); 516 517 ++m_parenthesesNestingDepth; 518 } 519 520 /* 521 * parseParenthesesEnd(): 522 * 523 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). 524 */ parseParenthesesEnd()525 void parseParenthesesEnd() 526 { 527 ASSERT(!m_err); 528 ASSERT(peek() == ')'); 529 consume(); 530 531 if (m_parenthesesNestingDepth > 0) 532 m_delegate.atomParenthesesEnd(); 533 else 534 m_err = ParenthesesUnmatched; 535 536 --m_parenthesesNestingDepth; 537 } 538 539 /* 540 * parseQuantifier(): 541 * 542 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. 543 */ parseQuantifier(bool lastTokenWasAnAtom,unsigned min,unsigned max)544 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) 545 { 546 ASSERT(!m_err); 547 ASSERT(min <= max); 548 549 if (lastTokenWasAnAtom) 550 m_delegate.quantifyAtom(min, max, !tryConsume('?')); 551 else 552 m_err = QuantifierWithoutAtom; 553 } 554 555 /* 556 * parseTokens(): 557 * 558 * This method loops over the input pattern reporting tokens to the delegate. 559 * The method returns when a parse error is detected, or the end of the pattern 560 * is reached. One piece of state is tracked around the loop, which is whether 561 * the last token passed to the delegate was an atom (this is necessary to detect 562 * a parse error when a quantifier provided without an atom to quantify). 563 */ parseTokens()564 void parseTokens() 565 { 566 bool lastTokenWasAnAtom = false; 567 568 while (!atEndOfPattern()) { 569 switch (peek()) { 570 case '|': 571 consume(); 572 m_delegate.disjunction(); 573 lastTokenWasAnAtom = false; 574 break; 575 576 case '(': 577 parseParenthesesBegin(); 578 lastTokenWasAnAtom = false; 579 break; 580 581 case ')': 582 parseParenthesesEnd(); 583 lastTokenWasAnAtom = true; 584 break; 585 586 case '^': 587 consume(); 588 m_delegate.assertionBOL(); 589 lastTokenWasAnAtom = false; 590 break; 591 592 case '$': 593 consume(); 594 m_delegate.assertionEOL(); 595 lastTokenWasAnAtom = false; 596 break; 597 598 case '.': 599 consume(); 600 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); 601 lastTokenWasAnAtom = true; 602 break; 603 604 case '[': 605 parseCharacterClass(); 606 lastTokenWasAnAtom = true; 607 break; 608 609 case '\\': 610 lastTokenWasAnAtom = parseAtomEscape(); 611 break; 612 613 case '*': 614 consume(); 615 parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); 616 lastTokenWasAnAtom = false; 617 break; 618 619 case '+': 620 consume(); 621 parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); 622 lastTokenWasAnAtom = false; 623 break; 624 625 case '?': 626 consume(); 627 parseQuantifier(lastTokenWasAnAtom, 0, 1); 628 lastTokenWasAnAtom = false; 629 break; 630 631 case '{': { 632 ParseState state = saveState(); 633 634 consume(); 635 if (peekIsDigit()) { 636 unsigned min = consumeNumber(); 637 unsigned max = min; 638 639 if (tryConsume(',')) 640 max = peekIsDigit() ? consumeNumber() : quantifyInfinite; 641 642 if (tryConsume('}')) { 643 if (min <= max) 644 parseQuantifier(lastTokenWasAnAtom, min, max); 645 else 646 m_err = QuantifierOutOfOrder; 647 lastTokenWasAnAtom = false; 648 break; 649 } 650 } 651 652 restoreState(state); 653 } // if we did not find a complete quantifer, fall through to the default case. 654 655 default: 656 m_delegate.atomPatternCharacter(consume()); 657 lastTokenWasAnAtom = true; 658 } 659 660 if (m_err) 661 return; 662 } 663 664 if (m_parenthesesNestingDepth > 0) 665 m_err = MissingParentheses; 666 } 667 668 /* 669 * parse(): 670 * 671 * This method calls parseTokens() to parse over the input and converts any 672 * error code to a const char* for a result. 673 */ parse()674 const char* parse() 675 { 676 if (m_size > MAX_PATTERN_SIZE) 677 m_err = PatternTooLarge; 678 else 679 parseTokens(); 680 ASSERT(atEndOfPattern() || m_err); 681 682 // The order of this array must match the ErrorCode enum. 683 static const char* errorMessages[NumberOfErrorCodes] = { 684 0, // NoError 685 REGEXP_ERROR_PREFIX "regular expression too large", 686 REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", 687 REGEXP_ERROR_PREFIX "nothing to repeat", 688 REGEXP_ERROR_PREFIX "missing )", 689 REGEXP_ERROR_PREFIX "unmatched parentheses", 690 REGEXP_ERROR_PREFIX "unrecognized character after (?", 691 REGEXP_ERROR_PREFIX "missing terminating ] for character class", 692 REGEXP_ERROR_PREFIX "range out of order in character class", 693 REGEXP_ERROR_PREFIX "\\ at end of pattern" 694 }; 695 696 return errorMessages[m_err]; 697 } 698 699 700 // Misc helper functions: 701 702 typedef unsigned ParseState; 703 saveState()704 ParseState saveState() 705 { 706 return m_index; 707 } 708 restoreState(ParseState state)709 void restoreState(ParseState state) 710 { 711 m_index = state; 712 } 713 atEndOfPattern()714 bool atEndOfPattern() 715 { 716 ASSERT(m_index <= m_size); 717 return m_index == m_size; 718 } 719 peek()720 int peek() 721 { 722 ASSERT(m_index < m_size); 723 return m_data[m_index]; 724 } 725 peekIsDigit()726 bool peekIsDigit() 727 { 728 return !atEndOfPattern() && WTF::isASCIIDigit(peek()); 729 } 730 peekDigit()731 unsigned peekDigit() 732 { 733 ASSERT(peekIsDigit()); 734 return peek() - '0'; 735 } 736 consume()737 int consume() 738 { 739 ASSERT(m_index < m_size); 740 return m_data[m_index++]; 741 } 742 consumeDigit()743 unsigned consumeDigit() 744 { 745 ASSERT(peekIsDigit()); 746 return consume() - '0'; 747 } 748 consumeNumber()749 unsigned consumeNumber() 750 { 751 unsigned n = consumeDigit(); 752 // check for overflow. 753 for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { 754 n = newValue; 755 consume(); 756 } 757 return n; 758 } 759 consumeOctal()760 unsigned consumeOctal() 761 { 762 ASSERT(WTF::isASCIIOctalDigit(peek())); 763 764 unsigned n = consumeDigit(); 765 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) 766 n = n * 8 + consumeDigit(); 767 return n; 768 } 769 tryConsume(UChar ch)770 bool tryConsume(UChar ch) 771 { 772 if (atEndOfPattern() || (m_data[m_index] != ch)) 773 return false; 774 ++m_index; 775 return true; 776 } 777 tryConsumeHex(int count)778 int tryConsumeHex(int count) 779 { 780 ParseState state = saveState(); 781 782 int n = 0; 783 while (count--) { 784 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { 785 restoreState(state); 786 return -1; 787 } 788 n = (n << 4) | WTF::toASCIIHexValue(consume()); 789 } 790 return n; 791 } 792 793 Delegate& m_delegate; 794 unsigned m_backReferenceLimit; 795 ErrorCode m_err; 796 const UChar* m_data; 797 unsigned m_size; 798 unsigned m_index; 799 unsigned m_parenthesesNestingDepth; 800 801 // Derived by empirical testing of compile time in PCRE and WREC. 802 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; 803 }; 804 805 /* 806 * Yarr::parse(): 807 * 808 * The parse method is passed a pattern to be parsed and a delegate upon which 809 * callbacks will be made to record the parsed tokens forming the regex. 810 * Yarr::parse() returns null on success, or a const C string providing an error 811 * message where a parse error occurs. 812 * 813 * The Delegate must implement the following interface: 814 * 815 * void assertionBOL(); 816 * void assertionEOL(); 817 * void assertionWordBoundary(bool invert); 818 * 819 * void atomPatternCharacter(UChar ch); 820 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); 821 * void atomCharacterClassBegin(bool invert) 822 * void atomCharacterClassAtom(UChar ch) 823 * void atomCharacterClassRange(UChar begin, UChar end) 824 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) 825 * void atomCharacterClassEnd() 826 * void atomParenthesesSubpatternBegin(bool capture = true); 827 * void atomParentheticalAssertionBegin(bool invert = false); 828 * void atomParenthesesEnd(); 829 * void atomBackReference(unsigned subpatternId); 830 * 831 * void quantifyAtom(unsigned min, unsigned max, bool greedy); 832 * 833 * void disjunction(); 834 * 835 * The regular expression is described by a sequence of assertion*() and atom*() 836 * callbacks to the delegate, describing the terms in the regular expression. 837 * Following an atom a quantifyAtom() call may occur to indicate that the previous 838 * atom should be quantified. In the case of atoms described across multiple 839 * calls (parentheses and character classes) the call to quantifyAtom() will come 840 * after the call to the atom*End() method, never after atom*Begin(). 841 * 842 * Character classes may either be described by a single call to 843 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. 844 * In the latter case, ...Begin() will be called, followed by a sequence of 845 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). 846 * 847 * Sequences of atoms and assertions are broken into alternatives via calls to 848 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to 849 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. 850 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular 851 * capturing subpattern, this will be the subpatternId associated with these 852 * parentheses, and will also by definition be the lowest subpatternId of these 853 * parentheses and of any nested paretheses. The atomParenthesesEnd() method 854 * is passed the subpatternId of the last capturing subexpression nested within 855 * these paretheses. In the case of a capturing subpattern with no nested 856 * capturing subpatterns, the same subpatternId will be passed to the begin and 857 * end functions. In the case of non-capturing subpatterns the subpatternId 858 * passed to the begin method is also the first possible subpatternId that might 859 * be nested within these paretheses. If a set of non-capturing parentheses does 860 * not contain any capturing subpatterns, then the subpatternId passed to begin 861 * will be greater than the subpatternId passed to end. 862 */ 863 864 template<class Delegate> 865 const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite) 866 { 867 return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); 868 } 869 870 } } // namespace JSC::Yarr 871 872 #endif // YarrParser_h 873