1 # 2 # Secret Labs' Regular Expression Engine 3 # 4 # convert re-style regular expression to sre pattern 5 # 6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 7 # 8 # See the sre.py file for information on usage and redistribution. 9 # 10 11 """Internal support module for sre""" 12 13 # XXX: show string offset and offending character for all errors 14 15 from sre_constants import * 16 17 SPECIAL_CHARS = ".\\[{()*+?^$|" 18 REPEAT_CHARS = "*+?{" 19 20 DIGITS = frozenset("0123456789") 21 22 OCTDIGITS = frozenset("01234567") 23 HEXDIGITS = frozenset("0123456789abcdefABCDEF") 24 ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") 25 26 WHITESPACE = frozenset(" \t\n\r\v\f") 27 28 _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) 29 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) 30 31 ESCAPES = { 32 r"\a": (LITERAL, ord("\a")), 33 r"\b": (LITERAL, ord("\b")), 34 r"\f": (LITERAL, ord("\f")), 35 r"\n": (LITERAL, ord("\n")), 36 r"\r": (LITERAL, ord("\r")), 37 r"\t": (LITERAL, ord("\t")), 38 r"\v": (LITERAL, ord("\v")), 39 r"\\": (LITERAL, ord("\\")) 40 } 41 42 CATEGORIES = { 43 r"\A": (AT, AT_BEGINNING_STRING), # start of string 44 r"\b": (AT, AT_BOUNDARY), 45 r"\B": (AT, AT_NON_BOUNDARY), 46 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), 47 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), 48 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), 49 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), 50 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), 51 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), 52 r"\Z": (AT, AT_END_STRING), # end of string 53 } 54 55 FLAGS = { 56 # standard flags 57 "i": SRE_FLAG_IGNORECASE, 58 "L": SRE_FLAG_LOCALE, 59 "m": SRE_FLAG_MULTILINE, 60 "s": SRE_FLAG_DOTALL, 61 "x": SRE_FLAG_VERBOSE, 62 # extensions 63 "a": SRE_FLAG_ASCII, 64 "t": SRE_FLAG_TEMPLATE, 65 "u": SRE_FLAG_UNICODE, 66 } 67 68 TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE 69 GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE 70 71 class Verbose(Exception): 72 pass 73 74 class State: 75 # keeps track of state for parsing 76 def __init__(self): 77 self.flags = 0 78 self.groupdict = {} 79 self.groupwidths = [None] # group 0 80 self.lookbehindgroups = None 81 @property 82 def groups(self): 83 return len(self.groupwidths) 84 def opengroup(self, name=None): 85 gid = self.groups 86 self.groupwidths.append(None) 87 if self.groups > MAXGROUPS: 88 raise error("too many groups") 89 if name is not None: 90 ogid = self.groupdict.get(name, None) 91 if ogid is not None: 92 raise error("redefinition of group name %r as group %d; " 93 "was group %d" % (name, gid, ogid)) 94 self.groupdict[name] = gid 95 return gid 96 def closegroup(self, gid, p): 97 self.groupwidths[gid] = p.getwidth() 98 def checkgroup(self, gid): 99 return gid < self.groups and self.groupwidths[gid] is not None 100 101 def checklookbehindgroup(self, gid, source): 102 if self.lookbehindgroups is not None: 103 if not self.checkgroup(gid): 104 raise source.error('cannot refer to an open group') 105 if gid >= self.lookbehindgroups: 106 raise source.error('cannot refer to group defined in the same ' 107 'lookbehind subpattern') 108 109 class SubPattern: 110 # a subpattern, in intermediate form 111 def __init__(self, state, data=None): 112 self.state = state 113 if data is None: 114 data = [] 115 self.data = data 116 self.width = None 117 118 def dump(self, level=0): 119 nl = True 120 seqtypes = (tuple, list) 121 for op, av in self.data: 122 print(level*" " + str(op), end='') 123 if op is IN: 124 # member sublanguage 125 print() 126 for op, a in av: 127 print((level+1)*" " + str(op), a) 128 elif op is BRANCH: 129 print() 130 for i, a in enumerate(av[1]): 131 if i: 132 print(level*" " + "OR") 133 a.dump(level+1) 134 elif op is GROUPREF_EXISTS: 135 condgroup, item_yes, item_no = av 136 print('', condgroup) 137 item_yes.dump(level+1) 138 if item_no: 139 print(level*" " + "ELSE") 140 item_no.dump(level+1) 141 elif isinstance(av, seqtypes): 142 nl = False 143 for a in av: 144 if isinstance(a, SubPattern): 145 if not nl: 146 print() 147 a.dump(level+1) 148 nl = True 149 else: 150 if not nl: 151 print(' ', end='') 152 print(a, end='') 153 nl = False 154 if not nl: 155 print() 156 else: 157 print('', av) 158 def __repr__(self): 159 return repr(self.data) 160 def __len__(self): 161 return len(self.data) 162 def __delitem__(self, index): 163 del self.data[index] 164 def __getitem__(self, index): 165 if isinstance(index, slice): 166 return SubPattern(self.state, self.data[index]) 167 return self.data[index] 168 def __setitem__(self, index, code): 169 self.data[index] = code 170 def insert(self, index, code): 171 self.data.insert(index, code) 172 def append(self, code): 173 self.data.append(code) 174 def getwidth(self): 175 # determine the width (min, max) for this subpattern 176 if self.width is not None: 177 return self.width 178 lo = hi = 0 179 for op, av in self.data: 180 if op is BRANCH: 181 i = MAXREPEAT - 1 182 j = 0 183 for av in av[1]: 184 l, h = av.getwidth() 185 i = min(i, l) 186 j = max(j, h) 187 lo = lo + i 188 hi = hi + j 189 elif op is CALL: 190 i, j = av.getwidth() 191 lo = lo + i 192 hi = hi + j 193 elif op is SUBPATTERN: 194 i, j = av[-1].getwidth() 195 lo = lo + i 196 hi = hi + j 197 elif op in _REPEATCODES: 198 i, j = av[2].getwidth() 199 lo = lo + i * av[0] 200 hi = hi + j * av[1] 201 elif op in _UNITCODES: 202 lo = lo + 1 203 hi = hi + 1 204 elif op is GROUPREF: 205 i, j = self.state.groupwidths[av] 206 lo = lo + i 207 hi = hi + j 208 elif op is GROUPREF_EXISTS: 209 i, j = av[1].getwidth() 210 if av[2] is not None: 211 l, h = av[2].getwidth() 212 i = min(i, l) 213 j = max(j, h) 214 else: 215 i = 0 216 lo = lo + i 217 hi = hi + j 218 elif op is SUCCESS: 219 break 220 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) 221 return self.width 222 223 class Tokenizer: 224 def __init__(self, string): 225 self.istext = isinstance(string, str) 226 self.string = string 227 if not self.istext: 228 string = str(string, 'latin1') 229 self.decoded_string = string 230 self.index = 0 231 self.next = None 232 self.__next() 233 def __next(self): 234 index = self.index 235 try: 236 char = self.decoded_string[index] 237 except IndexError: 238 self.next = None 239 return 240 if char == "\\": 241 index += 1 242 try: 243 char += self.decoded_string[index] 244 except IndexError: 245 raise error("bad escape (end of pattern)", 246 self.string, len(self.string) - 1) from None 247 self.index = index + 1 248 self.next = char 249 def match(self, char): 250 if char == self.next: 251 self.__next() 252 return True 253 return False 254 def get(self): 255 this = self.next 256 self.__next() 257 return this 258 def getwhile(self, n, charset): 259 result = '' 260 for _ in range(n): 261 c = self.next 262 if c not in charset: 263 break 264 result += c 265 self.__next() 266 return result 267 def getuntil(self, terminator, name): 268 result = '' 269 while True: 270 c = self.next 271 self.__next() 272 if c is None: 273 if not result: 274 raise self.error("missing " + name) 275 raise self.error("missing %s, unterminated name" % terminator, 276 len(result)) 277 if c == terminator: 278 if not result: 279 raise self.error("missing " + name, 1) 280 break 281 result += c 282 return result 283 @property 284 def pos(self): 285 return self.index - len(self.next or '') 286 def tell(self): 287 return self.index - len(self.next or '') 288 def seek(self, index): 289 self.index = index 290 self.__next() 291 292 def error(self, msg, offset=0): 293 return error(msg, self.string, self.tell() - offset) 294 295 def _class_escape(source, escape): 296 # handle escape code inside character class 297 code = ESCAPES.get(escape) 298 if code: 299 return code 300 code = CATEGORIES.get(escape) 301 if code and code[0] is IN: 302 return code 303 try: 304 c = escape[1:2] 305 if c == "x": 306 # hexadecimal escape (exactly two digits) 307 escape += source.getwhile(2, HEXDIGITS) 308 if len(escape) != 4: 309 raise source.error("incomplete escape %s" % escape, len(escape)) 310 return LITERAL, int(escape[2:], 16) 311 elif c == "u" and source.istext: 312 # unicode escape (exactly four digits) 313 escape += source.getwhile(4, HEXDIGITS) 314 if len(escape) != 6: 315 raise source.error("incomplete escape %s" % escape, len(escape)) 316 return LITERAL, int(escape[2:], 16) 317 elif c == "U" and source.istext: 318 # unicode escape (exactly eight digits) 319 escape += source.getwhile(8, HEXDIGITS) 320 if len(escape) != 10: 321 raise source.error("incomplete escape %s" % escape, len(escape)) 322 c = int(escape[2:], 16) 323 chr(c) # raise ValueError for invalid code 324 return LITERAL, c 325 elif c == "N" and source.istext: 326 import unicodedata 327 # named unicode escape e.g. \N{EM DASH} 328 if not source.match('{'): 329 raise source.error("missing {") 330 charname = source.getuntil('}', 'character name') 331 try: 332 c = ord(unicodedata.lookup(charname)) 333 except KeyError: 334 raise source.error("undefined character name %r" % charname, 335 len(charname) + len(r'\N{}')) 336 return LITERAL, c 337 elif c in OCTDIGITS: 338 # octal escape (up to three digits) 339 escape += source.getwhile(2, OCTDIGITS) 340 c = int(escape[1:], 8) 341 if c > 0o377: 342 raise source.error('octal escape value %s outside of ' 343 'range 0-0o377' % escape, len(escape)) 344 return LITERAL, c 345 elif c in DIGITS: 346 raise ValueError 347 if len(escape) == 2: 348 if c in ASCIILETTERS: 349 raise source.error('bad escape %s' % escape, len(escape)) 350 return LITERAL, ord(escape[1]) 351 except ValueError: 352 pass 353 raise source.error("bad escape %s" % escape, len(escape)) 354 355 def _escape(source, escape, state): 356 # handle escape code in expression 357 code = CATEGORIES.get(escape) 358 if code: 359 return code 360 code = ESCAPES.get(escape) 361 if code: 362 return code 363 try: 364 c = escape[1:2] 365 if c == "x": 366 # hexadecimal escape 367 escape += source.getwhile(2, HEXDIGITS) 368 if len(escape) != 4: 369 raise source.error("incomplete escape %s" % escape, len(escape)) 370 return LITERAL, int(escape[2:], 16) 371 elif c == "u" and source.istext: 372 # unicode escape (exactly four digits) 373 escape += source.getwhile(4, HEXDIGITS) 374 if len(escape) != 6: 375 raise source.error("incomplete escape %s" % escape, len(escape)) 376 return LITERAL, int(escape[2:], 16) 377 elif c == "U" and source.istext: 378 # unicode escape (exactly eight digits) 379 escape += source.getwhile(8, HEXDIGITS) 380 if len(escape) != 10: 381 raise source.error("incomplete escape %s" % escape, len(escape)) 382 c = int(escape[2:], 16) 383 chr(c) # raise ValueError for invalid code 384 return LITERAL, c 385 elif c == "N" and source.istext: 386 import unicodedata 387 # named unicode escape e.g. \N{EM DASH} 388 if not source.match('{'): 389 raise source.error("missing {") 390 charname = source.getuntil('}', 'character name') 391 try: 392 c = ord(unicodedata.lookup(charname)) 393 except KeyError: 394 raise source.error("undefined character name %r" % charname, 395 len(charname) + len(r'\N{}')) 396 return LITERAL, c 397 elif c == "0": 398 # octal escape 399 escape += source.getwhile(2, OCTDIGITS) 400 return LITERAL, int(escape[1:], 8) 401 elif c in DIGITS: 402 # octal escape *or* decimal group reference (sigh) 403 if source.next in DIGITS: 404 escape += source.get() 405 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 406 source.next in OCTDIGITS): 407 # got three octal digits; this is an octal escape 408 escape += source.get() 409 c = int(escape[1:], 8) 410 if c > 0o377: 411 raise source.error('octal escape value %s outside of ' 412 'range 0-0o377' % escape, 413 len(escape)) 414 return LITERAL, c 415 # not an octal escape, so this is a group reference 416 group = int(escape[1:]) 417 if group < state.groups: 418 if not state.checkgroup(group): 419 raise source.error("cannot refer to an open group", 420 len(escape)) 421 state.checklookbehindgroup(group, source) 422 return GROUPREF, group 423 raise source.error("invalid group reference %d" % group, len(escape) - 1) 424 if len(escape) == 2: 425 if c in ASCIILETTERS: 426 raise source.error("bad escape %s" % escape, len(escape)) 427 return LITERAL, ord(escape[1]) 428 except ValueError: 429 pass 430 raise source.error("bad escape %s" % escape, len(escape)) 431 432 def _uniq(items): 433 return list(dict.fromkeys(items)) 434 435 def _parse_sub(source, state, verbose, nested): 436 # parse an alternation: a|b|c 437 438 items = [] 439 itemsappend = items.append 440 sourcematch = source.match 441 start = source.tell() 442 while True: 443 itemsappend(_parse(source, state, verbose, nested + 1, 444 not nested and not items)) 445 if not sourcematch("|"): 446 break 447 448 if len(items) == 1: 449 return items[0] 450 451 subpattern = SubPattern(state) 452 453 # check if all items share a common prefix 454 while True: 455 prefix = None 456 for item in items: 457 if not item: 458 break 459 if prefix is None: 460 prefix = item[0] 461 elif item[0] != prefix: 462 break 463 else: 464 # all subitems start with a common "prefix". 465 # move it out of the branch 466 for item in items: 467 del item[0] 468 subpattern.append(prefix) 469 continue # check next one 470 break 471 472 # check if the branch can be replaced by a character set 473 set = [] 474 for item in items: 475 if len(item) != 1: 476 break 477 op, av = item[0] 478 if op is LITERAL: 479 set.append((op, av)) 480 elif op is IN and av[0][0] is not NEGATE: 481 set.extend(av) 482 else: 483 break 484 else: 485 # we can store this as a character set instead of a 486 # branch (the compiler may optimize this even more) 487 subpattern.append((IN, _uniq(set))) 488 return subpattern 489 490 subpattern.append((BRANCH, (None, items))) 491 return subpattern 492 493 def _parse(source, state, verbose, nested, first=False): 494 # parse a simple pattern 495 subpattern = SubPattern(state) 496 497 # precompute constants into local variables 498 subpatternappend = subpattern.append 499 sourceget = source.get 500 sourcematch = source.match 501 _len = len 502 _ord = ord 503 504 while True: 505 506 this = source.next 507 if this is None: 508 break # end of pattern 509 if this in "|)": 510 break # end of subpattern 511 sourceget() 512 513 if verbose: 514 # skip whitespace and comments 515 if this in WHITESPACE: 516 continue 517 if this == "#": 518 while True: 519 this = sourceget() 520 if this is None or this == "\n": 521 break 522 continue 523 524 if this[0] == "\\": 525 code = _escape(source, this, state) 526 subpatternappend(code) 527 528 elif this not in SPECIAL_CHARS: 529 subpatternappend((LITERAL, _ord(this))) 530 531 elif this == "[": 532 here = source.tell() - 1 533 # character set 534 set = [] 535 setappend = set.append 536 ## if sourcematch(":"): 537 ## pass # handle character classes 538 if source.next == '[': 539 import warnings 540 warnings.warn( 541 'Possible nested set at position %d' % source.tell(), 542 FutureWarning, stacklevel=nested + 6 543 ) 544 negate = sourcematch("^") 545 # check remaining characters 546 while True: 547 this = sourceget() 548 if this is None: 549 raise source.error("unterminated character set", 550 source.tell() - here) 551 if this == "]" and set: 552 break 553 elif this[0] == "\\": 554 code1 = _class_escape(source, this) 555 else: 556 if set and this in '-&~|' and source.next == this: 557 import warnings 558 warnings.warn( 559 'Possible set %s at position %d' % ( 560 'difference' if this == '-' else 561 'intersection' if this == '&' else 562 'symmetric difference' if this == '~' else 563 'union', 564 source.tell() - 1), 565 FutureWarning, stacklevel=nested + 6 566 ) 567 code1 = LITERAL, _ord(this) 568 if sourcematch("-"): 569 # potential range 570 that = sourceget() 571 if that is None: 572 raise source.error("unterminated character set", 573 source.tell() - here) 574 if that == "]": 575 if code1[0] is IN: 576 code1 = code1[1][0] 577 setappend(code1) 578 setappend((LITERAL, _ord("-"))) 579 break 580 if that[0] == "\\": 581 code2 = _class_escape(source, that) 582 else: 583 if that == '-': 584 import warnings 585 warnings.warn( 586 'Possible set difference at position %d' % ( 587 source.tell() - 2), 588 FutureWarning, stacklevel=nested + 6 589 ) 590 code2 = LITERAL, _ord(that) 591 if code1[0] != LITERAL or code2[0] != LITERAL: 592 msg = "bad character range %s-%s" % (this, that) 593 raise source.error(msg, len(this) + 1 + len(that)) 594 lo = code1[1] 595 hi = code2[1] 596 if hi < lo: 597 msg = "bad character range %s-%s" % (this, that) 598 raise source.error(msg, len(this) + 1 + len(that)) 599 setappend((RANGE, (lo, hi))) 600 else: 601 if code1[0] is IN: 602 code1 = code1[1][0] 603 setappend(code1) 604 605 set = _uniq(set) 606 # XXX: <fl> should move set optimization to compiler! 607 if _len(set) == 1 and set[0][0] is LITERAL: 608 # optimization 609 if negate: 610 subpatternappend((NOT_LITERAL, set[0][1])) 611 else: 612 subpatternappend(set[0]) 613 else: 614 if negate: 615 set.insert(0, (NEGATE, None)) 616 # charmap optimization can't be added here because 617 # global flags still are not known 618 subpatternappend((IN, set)) 619 620 elif this in REPEAT_CHARS: 621 # repeat previous item 622 here = source.tell() 623 if this == "?": 624 min, max = 0, 1 625 elif this == "*": 626 min, max = 0, MAXREPEAT 627 628 elif this == "+": 629 min, max = 1, MAXREPEAT 630 elif this == "{": 631 if source.next == "}": 632 subpatternappend((LITERAL, _ord(this))) 633 continue 634 635 min, max = 0, MAXREPEAT 636 lo = hi = "" 637 while source.next in DIGITS: 638 lo += sourceget() 639 if sourcematch(","): 640 while source.next in DIGITS: 641 hi += sourceget() 642 else: 643 hi = lo 644 if not sourcematch("}"): 645 subpatternappend((LITERAL, _ord(this))) 646 source.seek(here) 647 continue 648 649 if lo: 650 min = int(lo) 651 if min >= MAXREPEAT: 652 raise OverflowError("the repetition number is too large") 653 if hi: 654 max = int(hi) 655 if max >= MAXREPEAT: 656 raise OverflowError("the repetition number is too large") 657 if max < min: 658 raise source.error("min repeat greater than max repeat", 659 source.tell() - here) 660 else: 661 raise AssertionError("unsupported quantifier %r" % (char,)) 662 # figure out which item to repeat 663 if subpattern: 664 item = subpattern[-1:] 665 else: 666 item = None 667 if not item or item[0][0] is AT: 668 raise source.error("nothing to repeat", 669 source.tell() - here + len(this)) 670 if item[0][0] in _REPEATCODES: 671 raise source.error("multiple repeat", 672 source.tell() - here + len(this)) 673 if item[0][0] is SUBPATTERN: 674 group, add_flags, del_flags, p = item[0][1] 675 if group is None and not add_flags and not del_flags: 676 item = p 677 if sourcematch("?"): 678 subpattern[-1] = (MIN_REPEAT, (min, max, item)) 679 else: 680 subpattern[-1] = (MAX_REPEAT, (min, max, item)) 681 682 elif this == ".": 683 subpatternappend((ANY, None)) 684 685 elif this == "(": 686 start = source.tell() - 1 687 group = True 688 name = None 689 add_flags = 0 690 del_flags = 0 691 if sourcematch("?"): 692 # options 693 char = sourceget() 694 if char is None: 695 raise source.error("unexpected end of pattern") 696 if char == "P": 697 # python extensions 698 if sourcematch("<"): 699 # named group: skip forward to end of name 700 name = source.getuntil(">", "group name") 701 if not name.isidentifier(): 702 msg = "bad character in group name %r" % name 703 raise source.error(msg, len(name) + 1) 704 elif sourcematch("="): 705 # named backreference 706 name = source.getuntil(")", "group name") 707 if not name.isidentifier(): 708 msg = "bad character in group name %r" % name 709 raise source.error(msg, len(name) + 1) 710 gid = state.groupdict.get(name) 711 if gid is None: 712 msg = "unknown group name %r" % name 713 raise source.error(msg, len(name) + 1) 714 if not state.checkgroup(gid): 715 raise source.error("cannot refer to an open group", 716 len(name) + 1) 717 state.checklookbehindgroup(gid, source) 718 subpatternappend((GROUPREF, gid)) 719 continue 720 721 else: 722 char = sourceget() 723 if char is None: 724 raise source.error("unexpected end of pattern") 725 raise source.error("unknown extension ?P" + char, 726 len(char) + 2) 727 elif char == ":": 728 # non-capturing group 729 group = None 730 elif char == "#": 731 # comment 732 while True: 733 if source.next is None: 734 raise source.error("missing ), unterminated comment", 735 source.tell() - start) 736 if sourceget() == ")": 737 break 738 continue 739 740 elif char in "=!<": 741 # lookahead assertions 742 dir = 1 743 if char == "<": 744 char = sourceget() 745 if char is None: 746 raise source.error("unexpected end of pattern") 747 if char not in "=!": 748 raise source.error("unknown extension ?<" + char, 749 len(char) + 2) 750 dir = -1 # lookbehind 751 lookbehindgroups = state.lookbehindgroups 752 if lookbehindgroups is None: 753 state.lookbehindgroups = state.groups 754 p = _parse_sub(source, state, verbose, nested + 1) 755 if dir < 0: 756 if lookbehindgroups is None: 757 state.lookbehindgroups = None 758 if not sourcematch(")"): 759 raise source.error("missing ), unterminated subpattern", 760 source.tell() - start) 761 if char == "=": 762 subpatternappend((ASSERT, (dir, p))) 763 else: 764 subpatternappend((ASSERT_NOT, (dir, p))) 765 continue 766 767 elif char == "(": 768 # conditional backreference group 769 condname = source.getuntil(")", "group name") 770 if condname.isidentifier(): 771 condgroup = state.groupdict.get(condname) 772 if condgroup is None: 773 msg = "unknown group name %r" % condname 774 raise source.error(msg, len(condname) + 1) 775 else: 776 try: 777 condgroup = int(condname) 778 if condgroup < 0: 779 raise ValueError 780 except ValueError: 781 msg = "bad character in group name %r" % condname 782 raise source.error(msg, len(condname) + 1) from None 783 if not condgroup: 784 raise source.error("bad group number", 785 len(condname) + 1) 786 if condgroup >= MAXGROUPS: 787 msg = "invalid group reference %d" % condgroup 788 raise source.error(msg, len(condname) + 1) 789 state.checklookbehindgroup(condgroup, source) 790 item_yes = _parse(source, state, verbose, nested + 1) 791 if source.match("|"): 792 item_no = _parse(source, state, verbose, nested + 1) 793 if source.next == "|": 794 raise source.error("conditional backref with more than two branches") 795 else: 796 item_no = None 797 if not source.match(")"): 798 raise source.error("missing ), unterminated subpattern", 799 source.tell() - start) 800 subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 801 continue 802 803 elif char in FLAGS or char == "-": 804 # flags 805 flags = _parse_flags(source, state, char) 806 if flags is None: # global flags 807 if not first or subpattern: 808 import warnings 809 warnings.warn( 810 'Flags not at the start of the expression %r%s' % ( 811 source.string[:20], # truncate long regexes 812 ' (truncated)' if len(source.string) > 20 else '', 813 ), 814 DeprecationWarning, stacklevel=nested + 6 815 ) 816 if (state.flags & SRE_FLAG_VERBOSE) and not verbose: 817 raise Verbose 818 continue 819 820 add_flags, del_flags = flags 821 group = None 822 else: 823 raise source.error("unknown extension ?" + char, 824 len(char) + 1) 825 826 # parse group contents 827 if group is not None: 828 try: 829 group = state.opengroup(name) 830 except error as err: 831 raise source.error(err.msg, len(name) + 1) from None 832 sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and 833 not (del_flags & SRE_FLAG_VERBOSE)) 834 p = _parse_sub(source, state, sub_verbose, nested + 1) 835 if not source.match(")"): 836 raise source.error("missing ), unterminated subpattern", 837 source.tell() - start) 838 if group is not None: 839 state.closegroup(group, p) 840 subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) 841 842 elif this == "^": 843 subpatternappend((AT, AT_BEGINNING)) 844 845 elif this == "$": 846 subpatternappend((AT, AT_END)) 847 848 else: 849 raise AssertionError("unsupported special character %r" % (char,)) 850 851 # unpack non-capturing groups 852 for i in range(len(subpattern))[::-1]: 853 op, av = subpattern[i] 854 if op is SUBPATTERN: 855 group, add_flags, del_flags, p = av 856 if group is None and not add_flags and not del_flags: 857 subpattern[i: i+1] = p 858 859 return subpattern 860 861 def _parse_flags(source, state, char): 862 sourceget = source.get 863 add_flags = 0 864 del_flags = 0 865 if char != "-": 866 while True: 867 flag = FLAGS[char] 868 if source.istext: 869 if char == 'L': 870 msg = "bad inline flags: cannot use 'L' flag with a str pattern" 871 raise source.error(msg) 872 else: 873 if char == 'u': 874 msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" 875 raise source.error(msg) 876 add_flags |= flag 877 if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: 878 msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" 879 raise source.error(msg) 880 char = sourceget() 881 if char is None: 882 raise source.error("missing -, : or )") 883 if char in ")-:": 884 break 885 if char not in FLAGS: 886 msg = "unknown flag" if char.isalpha() else "missing -, : or )" 887 raise source.error(msg, len(char)) 888 if char == ")": 889 state.flags |= add_flags 890 return None 891 if add_flags & GLOBAL_FLAGS: 892 raise source.error("bad inline flags: cannot turn on global flag", 1) 893 if char == "-": 894 char = sourceget() 895 if char is None: 896 raise source.error("missing flag") 897 if char not in FLAGS: 898 msg = "unknown flag" if char.isalpha() else "missing flag" 899 raise source.error(msg, len(char)) 900 while True: 901 flag = FLAGS[char] 902 if flag & TYPE_FLAGS: 903 msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" 904 raise source.error(msg) 905 del_flags |= flag 906 char = sourceget() 907 if char is None: 908 raise source.error("missing :") 909 if char == ":": 910 break 911 if char not in FLAGS: 912 msg = "unknown flag" if char.isalpha() else "missing :" 913 raise source.error(msg, len(char)) 914 assert char == ":" 915 if del_flags & GLOBAL_FLAGS: 916 raise source.error("bad inline flags: cannot turn off global flag", 1) 917 if add_flags & del_flags: 918 raise source.error("bad inline flags: flag turned on and off", 1) 919 return add_flags, del_flags 920 921 def fix_flags(src, flags): 922 # Check and fix flags according to the type of pattern (str or bytes) 923 if isinstance(src, str): 924 if flags & SRE_FLAG_LOCALE: 925 raise ValueError("cannot use LOCALE flag with a str pattern") 926 if not flags & SRE_FLAG_ASCII: 927 flags |= SRE_FLAG_UNICODE 928 elif flags & SRE_FLAG_UNICODE: 929 raise ValueError("ASCII and UNICODE flags are incompatible") 930 else: 931 if flags & SRE_FLAG_UNICODE: 932 raise ValueError("cannot use UNICODE flag with a bytes pattern") 933 if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: 934 raise ValueError("ASCII and LOCALE flags are incompatible") 935 return flags 936 937 def parse(str, flags=0, state=None): 938 # parse 're' pattern into list of (opcode, argument) tuples 939 940 source = Tokenizer(str) 941 942 if state is None: 943 state = State() 944 state.flags = flags 945 state.str = str 946 947 try: 948 p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0) 949 except Verbose: 950 # the VERBOSE flag was switched on inside the pattern. to be 951 # on the safe side, we'll parse the whole thing again... 952 state = State() 953 state.flags = flags | SRE_FLAG_VERBOSE 954 state.str = str 955 source.seek(0) 956 p = _parse_sub(source, state, True, 0) 957 958 p.state.flags = fix_flags(str, p.state.flags) 959 960 if source.next is not None: 961 assert source.next == ")" 962 raise source.error("unbalanced parenthesis") 963 964 if flags & SRE_FLAG_DEBUG: 965 p.dump() 966 967 return p 968 969 def parse_template(source, state): 970 # parse 're' replacement string into list of literals and 971 # group references 972 s = Tokenizer(source) 973 sget = s.get 974 groups = [] 975 literals = [] 976 literal = [] 977 lappend = literal.append 978 def addgroup(index, pos): 979 if index > state.groups: 980 raise s.error("invalid group reference %d" % index, pos) 981 if literal: 982 literals.append(''.join(literal)) 983 del literal[:] 984 groups.append((len(literals), index)) 985 literals.append(None) 986 groupindex = state.groupindex 987 while True: 988 this = sget() 989 if this is None: 990 break # end of replacement string 991 if this[0] == "\\": 992 # group 993 c = this[1] 994 if c == "g": 995 name = "" 996 if not s.match("<"): 997 raise s.error("missing <") 998 name = s.getuntil(">", "group name") 999 if name.isidentifier(): 1000 try: 1001 index = groupindex[name] 1002 except KeyError: 1003 raise IndexError("unknown group name %r" % name) 1004 else: 1005 try: 1006 index = int(name) 1007 if index < 0: 1008 raise ValueError 1009 except ValueError: 1010 raise s.error("bad character in group name %r" % name, 1011 len(name) + 1) from None 1012 if index >= MAXGROUPS: 1013 raise s.error("invalid group reference %d" % index, 1014 len(name) + 1) 1015 addgroup(index, len(name) + 1) 1016 elif c == "0": 1017 if s.next in OCTDIGITS: 1018 this += sget() 1019 if s.next in OCTDIGITS: 1020 this += sget() 1021 lappend(chr(int(this[1:], 8) & 0xff)) 1022 elif c in DIGITS: 1023 isoctal = False 1024 if s.next in DIGITS: 1025 this += sget() 1026 if (c in OCTDIGITS and this[2] in OCTDIGITS and 1027 s.next in OCTDIGITS): 1028 this += sget() 1029 isoctal = True 1030 c = int(this[1:], 8) 1031 if c > 0o377: 1032 raise s.error('octal escape value %s outside of ' 1033 'range 0-0o377' % this, len(this)) 1034 lappend(chr(c)) 1035 if not isoctal: 1036 addgroup(int(this[1:]), len(this) - 1) 1037 else: 1038 try: 1039 this = chr(ESCAPES[this][1]) 1040 except KeyError: 1041 if c in ASCIILETTERS: 1042 raise s.error('bad escape %s' % this, len(this)) 1043 lappend(this) 1044 else: 1045 lappend(this) 1046 if literal: 1047 literals.append(''.join(literal)) 1048 if not isinstance(source, str): 1049 # The tokenizer implicitly decodes bytes objects as latin-1, we must 1050 # therefore re-encode the final representation. 1051 literals = [None if s is None else s.encode('latin-1') for s in literals] 1052 return groups, literals 1053 1054 def expand_template(template, match): 1055 g = match.group 1056 empty = match.string[:0] 1057 groups, literals = template 1058 literals = literals[:] 1059 try: 1060 for index, group in groups: 1061 literals[index] = g(group) or empty 1062 except IndexError: 1063 raise error("invalid group reference %d" % index) 1064 return empty.join(literals) 1065