1# 2# Secret Labs' Regular Expression Engine 3# 4# convert re-style regular expression to sre pattern 5# 6# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13# XXX: show string offset and offending character for all errors 14 15import sys 16 17from sre_constants import * 18 19SPECIAL_CHARS = ".\\[{()*+?^$|" 20REPEAT_CHARS = "*+?{" 21 22DIGITS = set("0123456789") 23 24OCTDIGITS = set("01234567") 25HEXDIGITS = set("0123456789abcdefABCDEF") 26 27WHITESPACE = set(" \t\n\r\v\f") 28 29ESCAPES = { 30 r"\a": (LITERAL, ord("\a")), 31 r"\b": (LITERAL, ord("\b")), 32 r"\f": (LITERAL, ord("\f")), 33 r"\n": (LITERAL, ord("\n")), 34 r"\r": (LITERAL, ord("\r")), 35 r"\t": (LITERAL, ord("\t")), 36 r"\v": (LITERAL, ord("\v")), 37 r"\\": (LITERAL, ord("\\")) 38} 39 40CATEGORIES = { 41 r"\A": (AT, AT_BEGINNING_STRING), # start of string 42 r"\b": (AT, AT_BOUNDARY), 43 r"\B": (AT, AT_NON_BOUNDARY), 44 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), 45 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), 46 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), 47 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), 48 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), 49 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), 50 r"\Z": (AT, AT_END_STRING), # end of string 51} 52 53FLAGS = { 54 # standard flags 55 "i": SRE_FLAG_IGNORECASE, 56 "L": SRE_FLAG_LOCALE, 57 "m": SRE_FLAG_MULTILINE, 58 "s": SRE_FLAG_DOTALL, 59 "x": SRE_FLAG_VERBOSE, 60 # extensions 61 "t": SRE_FLAG_TEMPLATE, 62 "u": SRE_FLAG_UNICODE, 63} 64 65class Pattern: 66 # master pattern object. keeps track of global attributes 67 def __init__(self): 68 self.flags = 0 69 self.open = [] 70 self.groups = 1 71 self.groupdict = {} 72 self.lookbehind = 0 73 74 def opengroup(self, name=None): 75 gid = self.groups 76 self.groups = gid + 1 77 if name is not None: 78 ogid = self.groupdict.get(name, None) 79 if ogid is not None: 80 raise error, ("redefinition of group name %s as group %d; " 81 "was group %d" % (repr(name), gid, ogid)) 82 self.groupdict[name] = gid 83 self.open.append(gid) 84 return gid 85 def closegroup(self, gid): 86 self.open.remove(gid) 87 def checkgroup(self, gid): 88 return gid < self.groups and gid not in self.open 89 90class SubPattern: 91 # a subpattern, in intermediate form 92 def __init__(self, pattern, data=None): 93 self.pattern = pattern 94 if data is None: 95 data = [] 96 self.data = data 97 self.width = None 98 def dump(self, level=0): 99 seqtypes = (tuple, list) 100 for op, av in self.data: 101 print level*" " + op, 102 if op == IN: 103 # member sublanguage 104 print 105 for op, a in av: 106 print (level+1)*" " + op, a 107 elif op == BRANCH: 108 print 109 for i, a in enumerate(av[1]): 110 if i: 111 print level*" " + "or" 112 a.dump(level+1) 113 elif op == GROUPREF_EXISTS: 114 condgroup, item_yes, item_no = av 115 print condgroup 116 item_yes.dump(level+1) 117 if item_no: 118 print level*" " + "else" 119 item_no.dump(level+1) 120 elif isinstance(av, seqtypes): 121 nl = 0 122 for a in av: 123 if isinstance(a, SubPattern): 124 if not nl: 125 print 126 a.dump(level+1) 127 nl = 1 128 else: 129 print a, 130 nl = 0 131 if not nl: 132 print 133 else: 134 print av 135 def __repr__(self): 136 return repr(self.data) 137 def __len__(self): 138 return len(self.data) 139 def __delitem__(self, index): 140 del self.data[index] 141 def __getitem__(self, index): 142 if isinstance(index, slice): 143 return SubPattern(self.pattern, self.data[index]) 144 return self.data[index] 145 def __setitem__(self, index, code): 146 self.data[index] = code 147 def insert(self, index, code): 148 self.data.insert(index, code) 149 def append(self, code): 150 self.data.append(code) 151 def getwidth(self): 152 # determine the width (min, max) for this subpattern 153 if self.width: 154 return self.width 155 lo = hi = 0 156 UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) 157 REPEATCODES = (MIN_REPEAT, MAX_REPEAT) 158 for op, av in self.data: 159 if op is BRANCH: 160 i = MAXREPEAT - 1 161 j = 0 162 for av in av[1]: 163 l, h = av.getwidth() 164 i = min(i, l) 165 j = max(j, h) 166 lo = lo + i 167 hi = hi + j 168 elif op is CALL: 169 i, j = av.getwidth() 170 lo = lo + i 171 hi = hi + j 172 elif op is SUBPATTERN: 173 i, j = av[1].getwidth() 174 lo = lo + i 175 hi = hi + j 176 elif op in REPEATCODES: 177 i, j = av[2].getwidth() 178 lo = lo + i * av[0] 179 hi = hi + j * av[1] 180 elif op in UNITCODES: 181 lo = lo + 1 182 hi = hi + 1 183 elif op == SUCCESS: 184 break 185 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) 186 return self.width 187 188class Tokenizer: 189 def __init__(self, string): 190 self.string = string 191 self.index = 0 192 self.__next() 193 def __next(self): 194 if self.index >= len(self.string): 195 self.next = None 196 return 197 char = self.string[self.index] 198 if char[0] == "\\": 199 try: 200 c = self.string[self.index + 1] 201 except IndexError: 202 raise error, "bogus escape (end of line)" 203 char = char + c 204 self.index = self.index + len(char) 205 self.next = char 206 def match(self, char, skip=1): 207 if char == self.next: 208 if skip: 209 self.__next() 210 return 1 211 return 0 212 def get(self): 213 this = self.next 214 self.__next() 215 return this 216 def tell(self): 217 return self.index, self.next 218 def seek(self, index): 219 self.index, self.next = index 220 221def isident(char): 222 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" 223 224def isdigit(char): 225 return "0" <= char <= "9" 226 227def isname(name): 228 # check that group name is a valid string 229 if not isident(name[0]): 230 return False 231 for char in name[1:]: 232 if not isident(char) and not isdigit(char): 233 return False 234 return True 235 236def _class_escape(source, escape): 237 # handle escape code inside character class 238 code = ESCAPES.get(escape) 239 if code: 240 return code 241 code = CATEGORIES.get(escape) 242 if code and code[0] == IN: 243 return code 244 try: 245 c = escape[1:2] 246 if c == "x": 247 # hexadecimal escape (exactly two digits) 248 while source.next in HEXDIGITS and len(escape) < 4: 249 escape = escape + source.get() 250 escape = escape[2:] 251 if len(escape) != 2: 252 raise error, "bogus escape: %s" % repr("\\" + escape) 253 return LITERAL, int(escape, 16) & 0xff 254 elif c in OCTDIGITS: 255 # octal escape (up to three digits) 256 while source.next in OCTDIGITS and len(escape) < 4: 257 escape = escape + source.get() 258 escape = escape[1:] 259 return LITERAL, int(escape, 8) & 0xff 260 elif c in DIGITS: 261 raise error, "bogus escape: %s" % repr(escape) 262 if len(escape) == 2: 263 return LITERAL, ord(escape[1]) 264 except ValueError: 265 pass 266 raise error, "bogus escape: %s" % repr(escape) 267 268def _escape(source, escape, state): 269 # handle escape code in expression 270 code = CATEGORIES.get(escape) 271 if code: 272 return code 273 code = ESCAPES.get(escape) 274 if code: 275 return code 276 try: 277 c = escape[1:2] 278 if c == "x": 279 # hexadecimal escape 280 while source.next in HEXDIGITS and len(escape) < 4: 281 escape = escape + source.get() 282 if len(escape) != 4: 283 raise ValueError 284 return LITERAL, int(escape[2:], 16) & 0xff 285 elif c == "0": 286 # octal escape 287 while source.next in OCTDIGITS and len(escape) < 4: 288 escape = escape + source.get() 289 return LITERAL, int(escape[1:], 8) & 0xff 290 elif c in DIGITS: 291 # octal escape *or* decimal group reference (sigh) 292 if source.next in DIGITS: 293 escape = escape + source.get() 294 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 295 source.next in OCTDIGITS): 296 # got three octal digits; this is an octal escape 297 escape = escape + source.get() 298 return LITERAL, int(escape[1:], 8) & 0xff 299 # not an octal escape, so this is a group reference 300 group = int(escape[1:]) 301 if group < state.groups: 302 if not state.checkgroup(group): 303 raise error, "cannot refer to open group" 304 if state.lookbehind: 305 import warnings 306 warnings.warn('group references in lookbehind ' 307 'assertions are not supported', 308 RuntimeWarning) 309 return GROUPREF, group 310 raise ValueError 311 if len(escape) == 2: 312 return LITERAL, ord(escape[1]) 313 except ValueError: 314 pass 315 raise error, "bogus escape: %s" % repr(escape) 316 317def _parse_sub(source, state, nested=1): 318 # parse an alternation: a|b|c 319 320 items = [] 321 itemsappend = items.append 322 sourcematch = source.match 323 while 1: 324 itemsappend(_parse(source, state)) 325 if sourcematch("|"): 326 continue 327 if not nested: 328 break 329 if not source.next or sourcematch(")", 0): 330 break 331 else: 332 raise error, "pattern not properly closed" 333 334 if len(items) == 1: 335 return items[0] 336 337 subpattern = SubPattern(state) 338 subpatternappend = subpattern.append 339 340 # check if all items share a common prefix 341 while 1: 342 prefix = None 343 for item in items: 344 if not item: 345 break 346 if prefix is None: 347 prefix = item[0] 348 elif item[0] != prefix: 349 break 350 else: 351 # all subitems start with a common "prefix". 352 # move it out of the branch 353 for item in items: 354 del item[0] 355 subpatternappend(prefix) 356 continue # check next one 357 break 358 359 # check if the branch can be replaced by a character set 360 for item in items: 361 if len(item) != 1 or item[0][0] != LITERAL: 362 break 363 else: 364 # we can store this as a character set instead of a 365 # branch (the compiler may optimize this even more) 366 set = [] 367 setappend = set.append 368 for item in items: 369 setappend(item[0]) 370 subpatternappend((IN, set)) 371 return subpattern 372 373 subpattern.append((BRANCH, (None, items))) 374 return subpattern 375 376def _parse_sub_cond(source, state, condgroup): 377 item_yes = _parse(source, state) 378 if source.match("|"): 379 item_no = _parse(source, state) 380 if source.match("|"): 381 raise error, "conditional backref with more than two branches" 382 else: 383 item_no = None 384 if source.next and not source.match(")", 0): 385 raise error, "pattern not properly closed" 386 subpattern = SubPattern(state) 387 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 388 return subpattern 389 390_PATTERNENDERS = set("|)") 391_ASSERTCHARS = set("=!<") 392_LOOKBEHINDASSERTCHARS = set("=!") 393_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) 394 395def _parse(source, state): 396 # parse a simple pattern 397 subpattern = SubPattern(state) 398 399 # precompute constants into local variables 400 subpatternappend = subpattern.append 401 sourceget = source.get 402 sourcematch = source.match 403 _len = len 404 PATTERNENDERS = _PATTERNENDERS 405 ASSERTCHARS = _ASSERTCHARS 406 LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS 407 REPEATCODES = _REPEATCODES 408 409 while 1: 410 411 if source.next in PATTERNENDERS: 412 break # end of subpattern 413 this = sourceget() 414 if this is None: 415 break # end of pattern 416 417 if state.flags & SRE_FLAG_VERBOSE: 418 # skip whitespace and comments 419 if this in WHITESPACE: 420 continue 421 if this == "#": 422 while 1: 423 this = sourceget() 424 if this in (None, "\n"): 425 break 426 continue 427 428 if this and this[0] not in SPECIAL_CHARS: 429 subpatternappend((LITERAL, ord(this))) 430 431 elif this == "[": 432 # character set 433 set = [] 434 setappend = set.append 435## if sourcematch(":"): 436## pass # handle character classes 437 if sourcematch("^"): 438 setappend((NEGATE, None)) 439 # check remaining characters 440 start = set[:] 441 while 1: 442 this = sourceget() 443 if this == "]" and set != start: 444 break 445 elif this and this[0] == "\\": 446 code1 = _class_escape(source, this) 447 elif this: 448 code1 = LITERAL, ord(this) 449 else: 450 raise error, "unexpected end of regular expression" 451 if sourcematch("-"): 452 # potential range 453 this = sourceget() 454 if this == "]": 455 if code1[0] is IN: 456 code1 = code1[1][0] 457 setappend(code1) 458 setappend((LITERAL, ord("-"))) 459 break 460 elif this: 461 if this[0] == "\\": 462 code2 = _class_escape(source, this) 463 else: 464 code2 = LITERAL, ord(this) 465 if code1[0] != LITERAL or code2[0] != LITERAL: 466 raise error, "bad character range" 467 lo = code1[1] 468 hi = code2[1] 469 if hi < lo: 470 raise error, "bad character range" 471 setappend((RANGE, (lo, hi))) 472 else: 473 raise error, "unexpected end of regular expression" 474 else: 475 if code1[0] is IN: 476 code1 = code1[1][0] 477 setappend(code1) 478 479 # XXX: <fl> should move set optimization to compiler! 480 if _len(set)==1 and set[0][0] is LITERAL: 481 subpatternappend(set[0]) # optimization 482 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: 483 subpatternappend((NOT_LITERAL, set[1][1])) # optimization 484 else: 485 # XXX: <fl> should add charmap optimization here 486 subpatternappend((IN, set)) 487 488 elif this and this[0] in REPEAT_CHARS: 489 # repeat previous item 490 if this == "?": 491 min, max = 0, 1 492 elif this == "*": 493 min, max = 0, MAXREPEAT 494 495 elif this == "+": 496 min, max = 1, MAXREPEAT 497 elif this == "{": 498 if source.next == "}": 499 subpatternappend((LITERAL, ord(this))) 500 continue 501 here = source.tell() 502 min, max = 0, MAXREPEAT 503 lo = hi = "" 504 while source.next in DIGITS: 505 lo = lo + source.get() 506 if sourcematch(","): 507 while source.next in DIGITS: 508 hi = hi + sourceget() 509 else: 510 hi = lo 511 if not sourcematch("}"): 512 subpatternappend((LITERAL, ord(this))) 513 source.seek(here) 514 continue 515 if lo: 516 min = int(lo) 517 if min >= MAXREPEAT: 518 raise OverflowError("the repetition number is too large") 519 if hi: 520 max = int(hi) 521 if max >= MAXREPEAT: 522 raise OverflowError("the repetition number is too large") 523 if max < min: 524 raise error("bad repeat interval") 525 else: 526 raise error, "not supported" 527 # figure out which item to repeat 528 if subpattern: 529 item = subpattern[-1:] 530 else: 531 item = None 532 if not item or (_len(item) == 1 and item[0][0] == AT): 533 raise error, "nothing to repeat" 534 if item[0][0] in REPEATCODES: 535 raise error, "multiple repeat" 536 if sourcematch("?"): 537 subpattern[-1] = (MIN_REPEAT, (min, max, item)) 538 else: 539 subpattern[-1] = (MAX_REPEAT, (min, max, item)) 540 541 elif this == ".": 542 subpatternappend((ANY, None)) 543 544 elif this == "(": 545 group = 1 546 name = None 547 condgroup = None 548 if sourcematch("?"): 549 group = 0 550 # options 551 if sourcematch("P"): 552 # python extensions 553 if sourcematch("<"): 554 # named group: skip forward to end of name 555 name = "" 556 while 1: 557 char = sourceget() 558 if char is None: 559 raise error, "unterminated name" 560 if char == ">": 561 break 562 name = name + char 563 group = 1 564 if not name: 565 raise error("missing group name") 566 if not isname(name): 567 raise error("bad character in group name %r" % 568 name) 569 elif sourcematch("="): 570 # named backreference 571 name = "" 572 while 1: 573 char = sourceget() 574 if char is None: 575 raise error, "unterminated name" 576 if char == ")": 577 break 578 name = name + char 579 if not name: 580 raise error("missing group name") 581 if not isname(name): 582 raise error("bad character in backref group name " 583 "%r" % name) 584 gid = state.groupdict.get(name) 585 if gid is None: 586 msg = "unknown group name: {0!r}".format(name) 587 raise error(msg) 588 if state.lookbehind: 589 import warnings 590 warnings.warn('group references in lookbehind ' 591 'assertions are not supported', 592 RuntimeWarning) 593 subpatternappend((GROUPREF, gid)) 594 continue 595 else: 596 char = sourceget() 597 if char is None: 598 raise error, "unexpected end of pattern" 599 raise error, "unknown specifier: ?P%s" % char 600 elif sourcematch(":"): 601 # non-capturing group 602 group = 2 603 elif sourcematch("#"): 604 # comment 605 while 1: 606 if source.next is None or source.next == ")": 607 break 608 sourceget() 609 if not sourcematch(")"): 610 raise error, "unbalanced parenthesis" 611 continue 612 elif source.next in ASSERTCHARS: 613 # lookahead assertions 614 char = sourceget() 615 dir = 1 616 if char == "<": 617 if source.next not in LOOKBEHINDASSERTCHARS: 618 raise error, "syntax error" 619 dir = -1 # lookbehind 620 char = sourceget() 621 state.lookbehind += 1 622 p = _parse_sub(source, state) 623 if dir < 0: 624 state.lookbehind -= 1 625 if not sourcematch(")"): 626 raise error, "unbalanced parenthesis" 627 if char == "=": 628 subpatternappend((ASSERT, (dir, p))) 629 else: 630 subpatternappend((ASSERT_NOT, (dir, p))) 631 continue 632 elif sourcematch("("): 633 # conditional backreference group 634 condname = "" 635 while 1: 636 char = sourceget() 637 if char is None: 638 raise error, "unterminated name" 639 if char == ")": 640 break 641 condname = condname + char 642 group = 2 643 if not condname: 644 raise error("missing group name") 645 if isname(condname): 646 condgroup = state.groupdict.get(condname) 647 if condgroup is None: 648 msg = "unknown group name: {0!r}".format(condname) 649 raise error(msg) 650 else: 651 try: 652 condgroup = int(condname) 653 except ValueError: 654 raise error, "bad character in group name" 655 if state.lookbehind: 656 import warnings 657 warnings.warn('group references in lookbehind ' 658 'assertions are not supported', 659 RuntimeWarning) 660 else: 661 # flags 662 if not source.next in FLAGS: 663 raise error, "unexpected end of pattern" 664 while source.next in FLAGS: 665 state.flags = state.flags | FLAGS[sourceget()] 666 if group: 667 # parse group contents 668 if group == 2: 669 # anonymous group 670 group = None 671 else: 672 group = state.opengroup(name) 673 if condgroup: 674 p = _parse_sub_cond(source, state, condgroup) 675 else: 676 p = _parse_sub(source, state) 677 if not sourcematch(")"): 678 raise error, "unbalanced parenthesis" 679 if group is not None: 680 state.closegroup(group) 681 subpatternappend((SUBPATTERN, (group, p))) 682 else: 683 while 1: 684 char = sourceget() 685 if char is None: 686 raise error, "unexpected end of pattern" 687 if char == ")": 688 break 689 raise error, "unknown extension" 690 691 elif this == "^": 692 subpatternappend((AT, AT_BEGINNING)) 693 694 elif this == "$": 695 subpattern.append((AT, AT_END)) 696 697 elif this and this[0] == "\\": 698 code = _escape(source, this, state) 699 subpatternappend(code) 700 701 else: 702 raise error, "parser error" 703 704 return subpattern 705 706def parse(str, flags=0, pattern=None): 707 # parse 're' pattern into list of (opcode, argument) tuples 708 709 source = Tokenizer(str) 710 711 if pattern is None: 712 pattern = Pattern() 713 pattern.flags = flags 714 pattern.str = str 715 716 p = _parse_sub(source, pattern, 0) 717 718 tail = source.get() 719 if tail == ")": 720 raise error, "unbalanced parenthesis" 721 elif tail: 722 raise error, "bogus characters at end of regular expression" 723 724 if flags & SRE_FLAG_DEBUG: 725 p.dump() 726 727 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: 728 # the VERBOSE flag was switched on inside the pattern. to be 729 # on the safe side, we'll parse the whole thing again... 730 return parse(str, p.pattern.flags) 731 732 return p 733 734def parse_template(source, pattern): 735 # parse 're' replacement string into list of literals and 736 # group references 737 s = Tokenizer(source) 738 sget = s.get 739 p = [] 740 a = p.append 741 def literal(literal, p=p, pappend=a): 742 if p and p[-1][0] is LITERAL: 743 p[-1] = LITERAL, p[-1][1] + literal 744 else: 745 pappend((LITERAL, literal)) 746 sep = source[:0] 747 if type(sep) is type(""): 748 makechar = chr 749 else: 750 makechar = unichr 751 while 1: 752 this = sget() 753 if this is None: 754 break # end of replacement string 755 if this and this[0] == "\\": 756 # group 757 c = this[1:2] 758 if c == "g": 759 name = "" 760 if s.match("<"): 761 while 1: 762 char = sget() 763 if char is None: 764 raise error, "unterminated group name" 765 if char == ">": 766 break 767 name = name + char 768 if not name: 769 raise error, "missing group name" 770 try: 771 index = int(name) 772 if index < 0: 773 raise error, "negative group number" 774 except ValueError: 775 if not isname(name): 776 raise error, "bad character in group name" 777 try: 778 index = pattern.groupindex[name] 779 except KeyError: 780 msg = "unknown group name: {0!r}".format(name) 781 raise IndexError(msg) 782 a((MARK, index)) 783 elif c == "0": 784 if s.next in OCTDIGITS: 785 this = this + sget() 786 if s.next in OCTDIGITS: 787 this = this + sget() 788 literal(makechar(int(this[1:], 8) & 0xff)) 789 elif c in DIGITS: 790 isoctal = False 791 if s.next in DIGITS: 792 this = this + sget() 793 if (c in OCTDIGITS and this[2] in OCTDIGITS and 794 s.next in OCTDIGITS): 795 this = this + sget() 796 isoctal = True 797 literal(makechar(int(this[1:], 8) & 0xff)) 798 if not isoctal: 799 a((MARK, int(this[1:]))) 800 else: 801 try: 802 this = makechar(ESCAPES[this][1]) 803 except KeyError: 804 pass 805 literal(this) 806 else: 807 literal(this) 808 # convert template to groups and literals lists 809 i = 0 810 groups = [] 811 groupsappend = groups.append 812 literals = [None] * len(p) 813 for c, s in p: 814 if c is MARK: 815 groupsappend((i, s)) 816 # literal[i] is already None 817 else: 818 literals[i] = s 819 i = i + 1 820 return groups, literals 821 822def expand_template(template, match): 823 g = match.group 824 sep = match.string[:0] 825 groups, literals = template 826 literals = literals[:] 827 try: 828 for index, group in groups: 829 literals[index] = s = g(group) 830 if s is None: 831 raise error, "unmatched group" 832 except IndexError: 833 raise error, "invalid group reference" 834 return sep.join(literals) 835