1# 2# Secret Labs' Regular Expression Engine 3# 4# convert re-style regular expression to sre pattern 5# 6# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13# XXX: show string offset and offending character for all errors 14 15import sys 16 17from sre_constants import * 18 19SPECIAL_CHARS = ".\\[{()*+?^$|" 20REPEAT_CHARS = "*+?{" 21 22DIGITS = set("0123456789") 23 24OCTDIGITS = set("01234567") 25HEXDIGITS = set("0123456789abcdefABCDEF") 26 27WHITESPACE = set(" \t\n\r\v\f") 28 29ESCAPES = { 30 r"\a": (LITERAL, ord("\a")), 31 r"\b": (LITERAL, ord("\b")), 32 r"\f": (LITERAL, ord("\f")), 33 r"\n": (LITERAL, ord("\n")), 34 r"\r": (LITERAL, ord("\r")), 35 r"\t": (LITERAL, ord("\t")), 36 r"\v": (LITERAL, ord("\v")), 37 r"\\": (LITERAL, ord("\\")) 38} 39 40CATEGORIES = { 41 r"\A": (AT, AT_BEGINNING_STRING), # start of string 42 r"\b": (AT, AT_BOUNDARY), 43 r"\B": (AT, AT_NON_BOUNDARY), 44 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), 45 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), 46 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), 47 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), 48 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), 49 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), 50 r"\Z": (AT, AT_END_STRING), # end of string 51} 52 53FLAGS = { 54 # standard flags 55 "i": SRE_FLAG_IGNORECASE, 56 "L": SRE_FLAG_LOCALE, 57 "m": SRE_FLAG_MULTILINE, 58 "s": SRE_FLAG_DOTALL, 59 "x": SRE_FLAG_VERBOSE, 60 # extensions 61 "t": SRE_FLAG_TEMPLATE, 62 "u": SRE_FLAG_UNICODE, 63} 64 65class Pattern: 66 # master pattern object. keeps track of global attributes 67 def __init__(self): 68 self.flags = 0 69 self.open = [] 70 self.groups = 1 71 self.groupdict = {} 72 def opengroup(self, name=None): 73 gid = self.groups 74 self.groups = gid + 1 75 if name is not None: 76 ogid = self.groupdict.get(name, None) 77 if ogid is not None: 78 raise error, ("redefinition of group name %s as group %d; " 79 "was group %d" % (repr(name), gid, ogid)) 80 self.groupdict[name] = gid 81 self.open.append(gid) 82 return gid 83 def closegroup(self, gid): 84 self.open.remove(gid) 85 def checkgroup(self, gid): 86 return gid < self.groups and gid not in self.open 87 88class SubPattern: 89 # a subpattern, in intermediate form 90 def __init__(self, pattern, data=None): 91 self.pattern = pattern 92 if data is None: 93 data = [] 94 self.data = data 95 self.width = None 96 def dump(self, level=0): 97 nl = 1 98 seqtypes = type(()), type([]) 99 for op, av in self.data: 100 print level*" " + op,; nl = 0 101 if op == "in": 102 # member sublanguage 103 print; nl = 1 104 for op, a in av: 105 print (level+1)*" " + op, a 106 elif op == "branch": 107 print; nl = 1 108 i = 0 109 for a in av[1]: 110 if i > 0: 111 print level*" " + "or" 112 a.dump(level+1); nl = 1 113 i = i + 1 114 elif type(av) in seqtypes: 115 for a in av: 116 if isinstance(a, SubPattern): 117 if not nl: print 118 a.dump(level+1); nl = 1 119 else: 120 print a, ; nl = 0 121 else: 122 print av, ; nl = 0 123 if not nl: print 124 def __repr__(self): 125 return repr(self.data) 126 def __len__(self): 127 return len(self.data) 128 def __delitem__(self, index): 129 del self.data[index] 130 def __getitem__(self, index): 131 if isinstance(index, slice): 132 return SubPattern(self.pattern, self.data[index]) 133 return self.data[index] 134 def __setitem__(self, index, code): 135 self.data[index] = code 136 def insert(self, index, code): 137 self.data.insert(index, code) 138 def append(self, code): 139 self.data.append(code) 140 def getwidth(self): 141 # determine the width (min, max) for this subpattern 142 if self.width: 143 return self.width 144 lo = hi = 0L 145 UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) 146 REPEATCODES = (MIN_REPEAT, MAX_REPEAT) 147 for op, av in self.data: 148 if op is BRANCH: 149 i = sys.maxint 150 j = 0 151 for av in av[1]: 152 l, h = av.getwidth() 153 i = min(i, l) 154 j = max(j, h) 155 lo = lo + i 156 hi = hi + j 157 elif op is CALL: 158 i, j = av.getwidth() 159 lo = lo + i 160 hi = hi + j 161 elif op is SUBPATTERN: 162 i, j = av[1].getwidth() 163 lo = lo + i 164 hi = hi + j 165 elif op in REPEATCODES: 166 i, j = av[2].getwidth() 167 lo = lo + long(i) * av[0] 168 hi = hi + long(j) * av[1] 169 elif op in UNITCODES: 170 lo = lo + 1 171 hi = hi + 1 172 elif op == SUCCESS: 173 break 174 self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) 175 return self.width 176 177class Tokenizer: 178 def __init__(self, string): 179 self.string = string 180 self.index = 0 181 self.__next() 182 def __next(self): 183 if self.index >= len(self.string): 184 self.next = None 185 return 186 char = self.string[self.index] 187 if char[0] == "\\": 188 try: 189 c = self.string[self.index + 1] 190 except IndexError: 191 raise error, "bogus escape (end of line)" 192 char = char + c 193 self.index = self.index + len(char) 194 self.next = char 195 def match(self, char, skip=1): 196 if char == self.next: 197 if skip: 198 self.__next() 199 return 1 200 return 0 201 def get(self): 202 this = self.next 203 self.__next() 204 return this 205 def tell(self): 206 return self.index, self.next 207 def seek(self, index): 208 self.index, self.next = index 209 210def isident(char): 211 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" 212 213def isdigit(char): 214 return "0" <= char <= "9" 215 216def isname(name): 217 # check that group name is a valid string 218 if not isident(name[0]): 219 return False 220 for char in name[1:]: 221 if not isident(char) and not isdigit(char): 222 return False 223 return True 224 225def _class_escape(source, escape): 226 # handle escape code inside character class 227 code = ESCAPES.get(escape) 228 if code: 229 return code 230 code = CATEGORIES.get(escape) 231 if code: 232 return code 233 try: 234 c = escape[1:2] 235 if c == "x": 236 # hexadecimal escape (exactly two digits) 237 while source.next in HEXDIGITS and len(escape) < 4: 238 escape = escape + source.get() 239 escape = escape[2:] 240 if len(escape) != 2: 241 raise error, "bogus escape: %s" % repr("\\" + escape) 242 return LITERAL, int(escape, 16) & 0xff 243 elif c in OCTDIGITS: 244 # octal escape (up to three digits) 245 while source.next in OCTDIGITS and len(escape) < 4: 246 escape = escape + source.get() 247 escape = escape[1:] 248 return LITERAL, int(escape, 8) & 0xff 249 elif c in DIGITS: 250 raise error, "bogus escape: %s" % repr(escape) 251 if len(escape) == 2: 252 return LITERAL, ord(escape[1]) 253 except ValueError: 254 pass 255 raise error, "bogus escape: %s" % repr(escape) 256 257def _escape(source, escape, state): 258 # handle escape code in expression 259 code = CATEGORIES.get(escape) 260 if code: 261 return code 262 code = ESCAPES.get(escape) 263 if code: 264 return code 265 try: 266 c = escape[1:2] 267 if c == "x": 268 # hexadecimal escape 269 while source.next in HEXDIGITS and len(escape) < 4: 270 escape = escape + source.get() 271 if len(escape) != 4: 272 raise ValueError 273 return LITERAL, int(escape[2:], 16) & 0xff 274 elif c == "0": 275 # octal escape 276 while source.next in OCTDIGITS and len(escape) < 4: 277 escape = escape + source.get() 278 return LITERAL, int(escape[1:], 8) & 0xff 279 elif c in DIGITS: 280 # octal escape *or* decimal group reference (sigh) 281 if source.next in DIGITS: 282 escape = escape + source.get() 283 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 284 source.next in OCTDIGITS): 285 # got three octal digits; this is an octal escape 286 escape = escape + source.get() 287 return LITERAL, int(escape[1:], 8) & 0xff 288 # not an octal escape, so this is a group reference 289 group = int(escape[1:]) 290 if group < state.groups: 291 if not state.checkgroup(group): 292 raise error, "cannot refer to open group" 293 return GROUPREF, group 294 raise ValueError 295 if len(escape) == 2: 296 return LITERAL, ord(escape[1]) 297 except ValueError: 298 pass 299 raise error, "bogus escape: %s" % repr(escape) 300 301def _parse_sub(source, state, nested=1): 302 # parse an alternation: a|b|c 303 304 items = [] 305 itemsappend = items.append 306 sourcematch = source.match 307 while 1: 308 itemsappend(_parse(source, state)) 309 if sourcematch("|"): 310 continue 311 if not nested: 312 break 313 if not source.next or sourcematch(")", 0): 314 break 315 else: 316 raise error, "pattern not properly closed" 317 318 if len(items) == 1: 319 return items[0] 320 321 subpattern = SubPattern(state) 322 subpatternappend = subpattern.append 323 324 # check if all items share a common prefix 325 while 1: 326 prefix = None 327 for item in items: 328 if not item: 329 break 330 if prefix is None: 331 prefix = item[0] 332 elif item[0] != prefix: 333 break 334 else: 335 # all subitems start with a common "prefix". 336 # move it out of the branch 337 for item in items: 338 del item[0] 339 subpatternappend(prefix) 340 continue # check next one 341 break 342 343 # check if the branch can be replaced by a character set 344 for item in items: 345 if len(item) != 1 or item[0][0] != LITERAL: 346 break 347 else: 348 # we can store this as a character set instead of a 349 # branch (the compiler may optimize this even more) 350 set = [] 351 setappend = set.append 352 for item in items: 353 setappend(item[0]) 354 subpatternappend((IN, set)) 355 return subpattern 356 357 subpattern.append((BRANCH, (None, items))) 358 return subpattern 359 360def _parse_sub_cond(source, state, condgroup): 361 item_yes = _parse(source, state) 362 if source.match("|"): 363 item_no = _parse(source, state) 364 if source.match("|"): 365 raise error, "conditional backref with more than two branches" 366 else: 367 item_no = None 368 if source.next and not source.match(")", 0): 369 raise error, "pattern not properly closed" 370 subpattern = SubPattern(state) 371 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 372 return subpattern 373 374_PATTERNENDERS = set("|)") 375_ASSERTCHARS = set("=!<") 376_LOOKBEHINDASSERTCHARS = set("=!") 377_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) 378 379def _parse(source, state): 380 # parse a simple pattern 381 subpattern = SubPattern(state) 382 383 # precompute constants into local variables 384 subpatternappend = subpattern.append 385 sourceget = source.get 386 sourcematch = source.match 387 _len = len 388 PATTERNENDERS = _PATTERNENDERS 389 ASSERTCHARS = _ASSERTCHARS 390 LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS 391 REPEATCODES = _REPEATCODES 392 393 while 1: 394 395 if source.next in PATTERNENDERS: 396 break # end of subpattern 397 this = sourceget() 398 if this is None: 399 break # end of pattern 400 401 if state.flags & SRE_FLAG_VERBOSE: 402 # skip whitespace and comments 403 if this in WHITESPACE: 404 continue 405 if this == "#": 406 while 1: 407 this = sourceget() 408 if this in (None, "\n"): 409 break 410 continue 411 412 if this and this[0] not in SPECIAL_CHARS: 413 subpatternappend((LITERAL, ord(this))) 414 415 elif this == "[": 416 # character set 417 set = [] 418 setappend = set.append 419## if sourcematch(":"): 420## pass # handle character classes 421 if sourcematch("^"): 422 setappend((NEGATE, None)) 423 # check remaining characters 424 start = set[:] 425 while 1: 426 this = sourceget() 427 if this == "]" and set != start: 428 break 429 elif this and this[0] == "\\": 430 code1 = _class_escape(source, this) 431 elif this: 432 code1 = LITERAL, ord(this) 433 else: 434 raise error, "unexpected end of regular expression" 435 if sourcematch("-"): 436 # potential range 437 this = sourceget() 438 if this == "]": 439 if code1[0] is IN: 440 code1 = code1[1][0] 441 setappend(code1) 442 setappend((LITERAL, ord("-"))) 443 break 444 elif this: 445 if this[0] == "\\": 446 code2 = _class_escape(source, this) 447 else: 448 code2 = LITERAL, ord(this) 449 if code1[0] != LITERAL or code2[0] != LITERAL: 450 raise error, "bad character range" 451 lo = code1[1] 452 hi = code2[1] 453 if hi < lo: 454 raise error, "bad character range" 455 setappend((RANGE, (lo, hi))) 456 else: 457 raise error, "unexpected end of regular expression" 458 else: 459 if code1[0] is IN: 460 code1 = code1[1][0] 461 setappend(code1) 462 463 # XXX: <fl> should move set optimization to compiler! 464 if _len(set)==1 and set[0][0] is LITERAL: 465 subpatternappend(set[0]) # optimization 466 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: 467 subpatternappend((NOT_LITERAL, set[1][1])) # optimization 468 else: 469 # XXX: <fl> should add charmap optimization here 470 subpatternappend((IN, set)) 471 472 elif this and this[0] in REPEAT_CHARS: 473 # repeat previous item 474 if this == "?": 475 min, max = 0, 1 476 elif this == "*": 477 min, max = 0, MAXREPEAT 478 479 elif this == "+": 480 min, max = 1, MAXREPEAT 481 elif this == "{": 482 if source.next == "}": 483 subpatternappend((LITERAL, ord(this))) 484 continue 485 here = source.tell() 486 min, max = 0, MAXREPEAT 487 lo = hi = "" 488 while source.next in DIGITS: 489 lo = lo + source.get() 490 if sourcematch(","): 491 while source.next in DIGITS: 492 hi = hi + sourceget() 493 else: 494 hi = lo 495 if not sourcematch("}"): 496 subpatternappend((LITERAL, ord(this))) 497 source.seek(here) 498 continue 499 if lo: 500 min = int(lo) 501 if hi: 502 max = int(hi) 503 if max < min: 504 raise error, "bad repeat interval" 505 else: 506 raise error, "not supported" 507 # figure out which item to repeat 508 if subpattern: 509 item = subpattern[-1:] 510 else: 511 item = None 512 if not item or (_len(item) == 1 and item[0][0] == AT): 513 raise error, "nothing to repeat" 514 if item[0][0] in REPEATCODES: 515 raise error, "multiple repeat" 516 if sourcematch("?"): 517 subpattern[-1] = (MIN_REPEAT, (min, max, item)) 518 else: 519 subpattern[-1] = (MAX_REPEAT, (min, max, item)) 520 521 elif this == ".": 522 subpatternappend((ANY, None)) 523 524 elif this == "(": 525 group = 1 526 name = None 527 condgroup = None 528 if sourcematch("?"): 529 group = 0 530 # options 531 if sourcematch("P"): 532 # python extensions 533 if sourcematch("<"): 534 # named group: skip forward to end of name 535 name = "" 536 while 1: 537 char = sourceget() 538 if char is None: 539 raise error, "unterminated name" 540 if char == ">": 541 break 542 name = name + char 543 group = 1 544 if not isname(name): 545 raise error, "bad character in group name" 546 elif sourcematch("="): 547 # named backreference 548 name = "" 549 while 1: 550 char = sourceget() 551 if char is None: 552 raise error, "unterminated name" 553 if char == ")": 554 break 555 name = name + char 556 if not isname(name): 557 raise error, "bad character in group name" 558 gid = state.groupdict.get(name) 559 if gid is None: 560 raise error, "unknown group name" 561 subpatternappend((GROUPREF, gid)) 562 continue 563 else: 564 char = sourceget() 565 if char is None: 566 raise error, "unexpected end of pattern" 567 raise error, "unknown specifier: ?P%s" % char 568 elif sourcematch(":"): 569 # non-capturing group 570 group = 2 571 elif sourcematch("#"): 572 # comment 573 while 1: 574 if source.next is None or source.next == ")": 575 break 576 sourceget() 577 if not sourcematch(")"): 578 raise error, "unbalanced parenthesis" 579 continue 580 elif source.next in ASSERTCHARS: 581 # lookahead assertions 582 char = sourceget() 583 dir = 1 584 if char == "<": 585 if source.next not in LOOKBEHINDASSERTCHARS: 586 raise error, "syntax error" 587 dir = -1 # lookbehind 588 char = sourceget() 589 p = _parse_sub(source, state) 590 if not sourcematch(")"): 591 raise error, "unbalanced parenthesis" 592 if char == "=": 593 subpatternappend((ASSERT, (dir, p))) 594 else: 595 subpatternappend((ASSERT_NOT, (dir, p))) 596 continue 597 elif sourcematch("("): 598 # conditional backreference group 599 condname = "" 600 while 1: 601 char = sourceget() 602 if char is None: 603 raise error, "unterminated name" 604 if char == ")": 605 break 606 condname = condname + char 607 group = 2 608 if isname(condname): 609 condgroup = state.groupdict.get(condname) 610 if condgroup is None: 611 raise error, "unknown group name" 612 else: 613 try: 614 condgroup = int(condname) 615 except ValueError: 616 raise error, "bad character in group name" 617 else: 618 # flags 619 if not source.next in FLAGS: 620 raise error, "unexpected end of pattern" 621 while source.next in FLAGS: 622 state.flags = state.flags | FLAGS[sourceget()] 623 if group: 624 # parse group contents 625 if group == 2: 626 # anonymous group 627 group = None 628 else: 629 group = state.opengroup(name) 630 if condgroup: 631 p = _parse_sub_cond(source, state, condgroup) 632 else: 633 p = _parse_sub(source, state) 634 if not sourcematch(")"): 635 raise error, "unbalanced parenthesis" 636 if group is not None: 637 state.closegroup(group) 638 subpatternappend((SUBPATTERN, (group, p))) 639 else: 640 while 1: 641 char = sourceget() 642 if char is None: 643 raise error, "unexpected end of pattern" 644 if char == ")": 645 break 646 raise error, "unknown extension" 647 648 elif this == "^": 649 subpatternappend((AT, AT_BEGINNING)) 650 651 elif this == "$": 652 subpattern.append((AT, AT_END)) 653 654 elif this and this[0] == "\\": 655 code = _escape(source, this, state) 656 subpatternappend(code) 657 658 else: 659 raise error, "parser error" 660 661 return subpattern 662 663def parse(str, flags=0, pattern=None): 664 # parse 're' pattern into list of (opcode, argument) tuples 665 666 source = Tokenizer(str) 667 668 if pattern is None: 669 pattern = Pattern() 670 pattern.flags = flags 671 pattern.str = str 672 673 p = _parse_sub(source, pattern, 0) 674 675 tail = source.get() 676 if tail == ")": 677 raise error, "unbalanced parenthesis" 678 elif tail: 679 raise error, "bogus characters at end of regular expression" 680 681 if flags & SRE_FLAG_DEBUG: 682 p.dump() 683 684 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: 685 # the VERBOSE flag was switched on inside the pattern. to be 686 # on the safe side, we'll parse the whole thing again... 687 return parse(str, p.pattern.flags) 688 689 return p 690 691def parse_template(source, pattern): 692 # parse 're' replacement string into list of literals and 693 # group references 694 s = Tokenizer(source) 695 sget = s.get 696 p = [] 697 a = p.append 698 def literal(literal, p=p, pappend=a): 699 if p and p[-1][0] is LITERAL: 700 p[-1] = LITERAL, p[-1][1] + literal 701 else: 702 pappend((LITERAL, literal)) 703 sep = source[:0] 704 if type(sep) is type(""): 705 makechar = chr 706 else: 707 makechar = unichr 708 while 1: 709 this = sget() 710 if this is None: 711 break # end of replacement string 712 if this and this[0] == "\\": 713 # group 714 c = this[1:2] 715 if c == "g": 716 name = "" 717 if s.match("<"): 718 while 1: 719 char = sget() 720 if char is None: 721 raise error, "unterminated group name" 722 if char == ">": 723 break 724 name = name + char 725 if not name: 726 raise error, "bad group name" 727 try: 728 index = int(name) 729 if index < 0: 730 raise error, "negative group number" 731 except ValueError: 732 if not isname(name): 733 raise error, "bad character in group name" 734 try: 735 index = pattern.groupindex[name] 736 except KeyError: 737 raise IndexError, "unknown group name" 738 a((MARK, index)) 739 elif c == "0": 740 if s.next in OCTDIGITS: 741 this = this + sget() 742 if s.next in OCTDIGITS: 743 this = this + sget() 744 literal(makechar(int(this[1:], 8) & 0xff)) 745 elif c in DIGITS: 746 isoctal = False 747 if s.next in DIGITS: 748 this = this + sget() 749 if (c in OCTDIGITS and this[2] in OCTDIGITS and 750 s.next in OCTDIGITS): 751 this = this + sget() 752 isoctal = True 753 literal(makechar(int(this[1:], 8) & 0xff)) 754 if not isoctal: 755 a((MARK, int(this[1:]))) 756 else: 757 try: 758 this = makechar(ESCAPES[this][1]) 759 except KeyError: 760 pass 761 literal(this) 762 else: 763 literal(this) 764 # convert template to groups and literals lists 765 i = 0 766 groups = [] 767 groupsappend = groups.append 768 literals = [None] * len(p) 769 for c, s in p: 770 if c is MARK: 771 groupsappend((i, s)) 772 # literal[i] is already None 773 else: 774 literals[i] = s 775 i = i + 1 776 return groups, literals 777 778def expand_template(template, match): 779 g = match.group 780 sep = match.string[:0] 781 groups, literals = template 782 literals = literals[:] 783 try: 784 for index, group in groups: 785 literals[index] = s = g(group) 786 if s is None: 787 raise error, "unmatched group" 788 except IndexError: 789 raise error, "invalid group reference" 790 return sep.join(literals) 791