1# 2# Secret Labs' Regular Expression Engine 3# 4# convert re-style regular expression to sre pattern 5# 6# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 7# 8# See the __init__.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13# XXX: show string offset and offending character for all errors 14 15from ._constants import * 16 17SPECIAL_CHARS = ".\\[{()*+?^$|" 18REPEAT_CHARS = "*+?{" 19 20DIGITS = frozenset("0123456789") 21 22OCTDIGITS = frozenset("01234567") 23HEXDIGITS = frozenset("0123456789abcdefABCDEF") 24ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") 25 26WHITESPACE = frozenset(" \t\n\r\v\f") 27 28_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) 29_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) 30 31ESCAPES = { 32 r"\a": (LITERAL, ord("\a")), 33 r"\b": (LITERAL, ord("\b")), 34 r"\f": (LITERAL, ord("\f")), 35 r"\n": (LITERAL, ord("\n")), 36 r"\r": (LITERAL, ord("\r")), 37 r"\t": (LITERAL, ord("\t")), 38 r"\v": (LITERAL, ord("\v")), 39 r"\\": (LITERAL, ord("\\")) 40} 41 42CATEGORIES = { 43 r"\A": (AT, AT_BEGINNING_STRING), # start of string 44 r"\b": (AT, AT_BOUNDARY), 45 r"\B": (AT, AT_NON_BOUNDARY), 46 r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), 47 r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), 48 r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), 49 r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), 50 r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), 51 r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), 52 r"\Z": (AT, AT_END_STRING), # end of string 53} 54 55FLAGS = { 56 # standard flags 57 "i": SRE_FLAG_IGNORECASE, 58 "L": SRE_FLAG_LOCALE, 59 "m": SRE_FLAG_MULTILINE, 60 "s": SRE_FLAG_DOTALL, 61 "x": SRE_FLAG_VERBOSE, 62 # extensions 63 "a": SRE_FLAG_ASCII, 64 "u": SRE_FLAG_UNICODE, 65} 66 67TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE 68GLOBAL_FLAGS = SRE_FLAG_DEBUG 69 70# Maximal value returned by SubPattern.getwidth(). 71# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize. 72MAXWIDTH = 1 << 64 73 74class State: 75 # keeps track of state for parsing 76 def __init__(self): 77 self.flags = 0 78 self.groupdict = {} 79 self.groupwidths = [None] # group 0 80 self.lookbehindgroups = None 81 self.grouprefpos = {} 82 @property 83 def groups(self): 84 return len(self.groupwidths) 85 def opengroup(self, name=None): 86 gid = self.groups 87 self.groupwidths.append(None) 88 if self.groups > MAXGROUPS: 89 raise error("too many groups") 90 if name is not None: 91 ogid = self.groupdict.get(name, None) 92 if ogid is not None: 93 raise error("redefinition of group name %r as group %d; " 94 "was group %d" % (name, gid, ogid)) 95 self.groupdict[name] = gid 96 return gid 97 def closegroup(self, gid, p): 98 self.groupwidths[gid] = p.getwidth() 99 def checkgroup(self, gid): 100 return gid < self.groups and self.groupwidths[gid] is not None 101 102 def checklookbehindgroup(self, gid, source): 103 if self.lookbehindgroups is not None: 104 if not self.checkgroup(gid): 105 raise source.error('cannot refer to an open group') 106 if gid >= self.lookbehindgroups: 107 raise source.error('cannot refer to group defined in the same ' 108 'lookbehind subpattern') 109 110class SubPattern: 111 # a subpattern, in intermediate form 112 def __init__(self, state, data=None): 113 self.state = state 114 if data is None: 115 data = [] 116 self.data = data 117 self.width = None 118 119 def dump(self, level=0): 120 seqtypes = (tuple, list) 121 for op, av in self.data: 122 print(level*" " + str(op), end='') 123 if op is IN: 124 # member sublanguage 125 print() 126 for op, a in av: 127 print((level+1)*" " + str(op), a) 128 elif op is BRANCH: 129 print() 130 for i, a in enumerate(av[1]): 131 if i: 132 print(level*" " + "OR") 133 a.dump(level+1) 134 elif op is GROUPREF_EXISTS: 135 condgroup, item_yes, item_no = av 136 print('', condgroup) 137 item_yes.dump(level+1) 138 if item_no: 139 print(level*" " + "ELSE") 140 item_no.dump(level+1) 141 elif isinstance(av, SubPattern): 142 print() 143 av.dump(level+1) 144 elif isinstance(av, seqtypes): 145 nl = False 146 for a in av: 147 if isinstance(a, SubPattern): 148 if not nl: 149 print() 150 a.dump(level+1) 151 nl = True 152 else: 153 if not nl: 154 print(' ', end='') 155 print(a, end='') 156 nl = False 157 if not nl: 158 print() 159 else: 160 print('', av) 161 def __repr__(self): 162 return repr(self.data) 163 def __len__(self): 164 return len(self.data) 165 def __delitem__(self, index): 166 del self.data[index] 167 def __getitem__(self, index): 168 if isinstance(index, slice): 169 return SubPattern(self.state, self.data[index]) 170 return self.data[index] 171 def __setitem__(self, index, code): 172 self.data[index] = code 173 def insert(self, index, code): 174 self.data.insert(index, code) 175 def append(self, code): 176 self.data.append(code) 177 def getwidth(self): 178 # determine the width (min, max) for this subpattern 179 if self.width is not None: 180 return self.width 181 lo = hi = 0 182 for op, av in self.data: 183 if op is BRANCH: 184 i = MAXWIDTH 185 j = 0 186 for av in av[1]: 187 l, h = av.getwidth() 188 i = min(i, l) 189 j = max(j, h) 190 lo = lo + i 191 hi = hi + j 192 elif op is ATOMIC_GROUP: 193 i, j = av.getwidth() 194 lo = lo + i 195 hi = hi + j 196 elif op is SUBPATTERN: 197 i, j = av[-1].getwidth() 198 lo = lo + i 199 hi = hi + j 200 elif op in _REPEATCODES: 201 i, j = av[2].getwidth() 202 lo = lo + i * av[0] 203 if av[1] == MAXREPEAT and j: 204 hi = MAXWIDTH 205 else: 206 hi = hi + j * av[1] 207 elif op in _UNITCODES: 208 lo = lo + 1 209 hi = hi + 1 210 elif op is GROUPREF: 211 i, j = self.state.groupwidths[av] 212 lo = lo + i 213 hi = hi + j 214 elif op is GROUPREF_EXISTS: 215 i, j = av[1].getwidth() 216 if av[2] is not None: 217 l, h = av[2].getwidth() 218 i = min(i, l) 219 j = max(j, h) 220 else: 221 i = 0 222 lo = lo + i 223 hi = hi + j 224 elif op is SUCCESS: 225 break 226 self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH) 227 return self.width 228 229class Tokenizer: 230 def __init__(self, string): 231 self.istext = isinstance(string, str) 232 self.string = string 233 if not self.istext: 234 string = str(string, 'latin1') 235 self.decoded_string = string 236 self.index = 0 237 self.next = None 238 self.__next() 239 def __next(self): 240 index = self.index 241 try: 242 char = self.decoded_string[index] 243 except IndexError: 244 self.next = None 245 return 246 if char == "\\": 247 index += 1 248 try: 249 char += self.decoded_string[index] 250 except IndexError: 251 raise error("bad escape (end of pattern)", 252 self.string, len(self.string) - 1) from None 253 self.index = index + 1 254 self.next = char 255 def match(self, char): 256 if char == self.next: 257 self.__next() 258 return True 259 return False 260 def get(self): 261 this = self.next 262 self.__next() 263 return this 264 def getwhile(self, n, charset): 265 result = '' 266 for _ in range(n): 267 c = self.next 268 if c not in charset: 269 break 270 result += c 271 self.__next() 272 return result 273 def getuntil(self, terminator, name): 274 result = '' 275 while True: 276 c = self.next 277 self.__next() 278 if c is None: 279 if not result: 280 raise self.error("missing " + name) 281 raise self.error("missing %s, unterminated name" % terminator, 282 len(result)) 283 if c == terminator: 284 if not result: 285 raise self.error("missing " + name, 1) 286 break 287 result += c 288 return result 289 @property 290 def pos(self): 291 return self.index - len(self.next or '') 292 def tell(self): 293 return self.index - len(self.next or '') 294 def seek(self, index): 295 self.index = index 296 self.__next() 297 298 def error(self, msg, offset=0): 299 if not self.istext: 300 msg = msg.encode('ascii', 'backslashreplace').decode('ascii') 301 return error(msg, self.string, self.tell() - offset) 302 303 def checkgroupname(self, name, offset): 304 if not (self.istext or name.isascii()): 305 msg = "bad character in group name %a" % name 306 raise self.error(msg, len(name) + offset) 307 if not name.isidentifier(): 308 msg = "bad character in group name %r" % name 309 raise self.error(msg, len(name) + offset) 310 311def _class_escape(source, escape): 312 # handle escape code inside character class 313 code = ESCAPES.get(escape) 314 if code: 315 return code 316 code = CATEGORIES.get(escape) 317 if code and code[0] is IN: 318 return code 319 try: 320 c = escape[1:2] 321 if c == "x": 322 # hexadecimal escape (exactly two digits) 323 escape += source.getwhile(2, HEXDIGITS) 324 if len(escape) != 4: 325 raise source.error("incomplete escape %s" % escape, len(escape)) 326 return LITERAL, int(escape[2:], 16) 327 elif c == "u" and source.istext: 328 # unicode escape (exactly four digits) 329 escape += source.getwhile(4, HEXDIGITS) 330 if len(escape) != 6: 331 raise source.error("incomplete escape %s" % escape, len(escape)) 332 return LITERAL, int(escape[2:], 16) 333 elif c == "U" and source.istext: 334 # unicode escape (exactly eight digits) 335 escape += source.getwhile(8, HEXDIGITS) 336 if len(escape) != 10: 337 raise source.error("incomplete escape %s" % escape, len(escape)) 338 c = int(escape[2:], 16) 339 chr(c) # raise ValueError for invalid code 340 return LITERAL, c 341 elif c == "N" and source.istext: 342 import unicodedata 343 # named unicode escape e.g. \N{EM DASH} 344 if not source.match('{'): 345 raise source.error("missing {") 346 charname = source.getuntil('}', 'character name') 347 try: 348 c = ord(unicodedata.lookup(charname)) 349 except (KeyError, TypeError): 350 raise source.error("undefined character name %r" % charname, 351 len(charname) + len(r'\N{}')) from None 352 return LITERAL, c 353 elif c in OCTDIGITS: 354 # octal escape (up to three digits) 355 escape += source.getwhile(2, OCTDIGITS) 356 c = int(escape[1:], 8) 357 if c > 0o377: 358 raise source.error('octal escape value %s outside of ' 359 'range 0-0o377' % escape, len(escape)) 360 return LITERAL, c 361 elif c in DIGITS: 362 raise ValueError 363 if len(escape) == 2: 364 if c in ASCIILETTERS: 365 raise source.error('bad escape %s' % escape, len(escape)) 366 return LITERAL, ord(escape[1]) 367 except ValueError: 368 pass 369 raise source.error("bad escape %s" % escape, len(escape)) 370 371def _escape(source, escape, state): 372 # handle escape code in expression 373 code = CATEGORIES.get(escape) 374 if code: 375 return code 376 code = ESCAPES.get(escape) 377 if code: 378 return code 379 try: 380 c = escape[1:2] 381 if c == "x": 382 # hexadecimal escape 383 escape += source.getwhile(2, HEXDIGITS) 384 if len(escape) != 4: 385 raise source.error("incomplete escape %s" % escape, len(escape)) 386 return LITERAL, int(escape[2:], 16) 387 elif c == "u" and source.istext: 388 # unicode escape (exactly four digits) 389 escape += source.getwhile(4, HEXDIGITS) 390 if len(escape) != 6: 391 raise source.error("incomplete escape %s" % escape, len(escape)) 392 return LITERAL, int(escape[2:], 16) 393 elif c == "U" and source.istext: 394 # unicode escape (exactly eight digits) 395 escape += source.getwhile(8, HEXDIGITS) 396 if len(escape) != 10: 397 raise source.error("incomplete escape %s" % escape, len(escape)) 398 c = int(escape[2:], 16) 399 chr(c) # raise ValueError for invalid code 400 return LITERAL, c 401 elif c == "N" and source.istext: 402 import unicodedata 403 # named unicode escape e.g. \N{EM DASH} 404 if not source.match('{'): 405 raise source.error("missing {") 406 charname = source.getuntil('}', 'character name') 407 try: 408 c = ord(unicodedata.lookup(charname)) 409 except (KeyError, TypeError): 410 raise source.error("undefined character name %r" % charname, 411 len(charname) + len(r'\N{}')) from None 412 return LITERAL, c 413 elif c == "0": 414 # octal escape 415 escape += source.getwhile(2, OCTDIGITS) 416 return LITERAL, int(escape[1:], 8) 417 elif c in DIGITS: 418 # octal escape *or* decimal group reference (sigh) 419 if source.next in DIGITS: 420 escape += source.get() 421 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 422 source.next in OCTDIGITS): 423 # got three octal digits; this is an octal escape 424 escape += source.get() 425 c = int(escape[1:], 8) 426 if c > 0o377: 427 raise source.error('octal escape value %s outside of ' 428 'range 0-0o377' % escape, 429 len(escape)) 430 return LITERAL, c 431 # not an octal escape, so this is a group reference 432 group = int(escape[1:]) 433 if group < state.groups: 434 if not state.checkgroup(group): 435 raise source.error("cannot refer to an open group", 436 len(escape)) 437 state.checklookbehindgroup(group, source) 438 return GROUPREF, group 439 raise source.error("invalid group reference %d" % group, len(escape) - 1) 440 if len(escape) == 2: 441 if c in ASCIILETTERS: 442 raise source.error("bad escape %s" % escape, len(escape)) 443 return LITERAL, ord(escape[1]) 444 except ValueError: 445 pass 446 raise source.error("bad escape %s" % escape, len(escape)) 447 448def _uniq(items): 449 return list(dict.fromkeys(items)) 450 451def _parse_sub(source, state, verbose, nested): 452 # parse an alternation: a|b|c 453 454 items = [] 455 itemsappend = items.append 456 sourcematch = source.match 457 start = source.tell() 458 while True: 459 itemsappend(_parse(source, state, verbose, nested + 1, 460 not nested and not items)) 461 if not sourcematch("|"): 462 break 463 if not nested: 464 verbose = state.flags & SRE_FLAG_VERBOSE 465 466 if len(items) == 1: 467 return items[0] 468 469 subpattern = SubPattern(state) 470 471 # check if all items share a common prefix 472 while True: 473 prefix = None 474 for item in items: 475 if not item: 476 break 477 if prefix is None: 478 prefix = item[0] 479 elif item[0] != prefix: 480 break 481 else: 482 # all subitems start with a common "prefix". 483 # move it out of the branch 484 for item in items: 485 del item[0] 486 subpattern.append(prefix) 487 continue # check next one 488 break 489 490 # check if the branch can be replaced by a character set 491 set = [] 492 for item in items: 493 if len(item) != 1: 494 break 495 op, av = item[0] 496 if op is LITERAL: 497 set.append((op, av)) 498 elif op is IN and av[0][0] is not NEGATE: 499 set.extend(av) 500 else: 501 break 502 else: 503 # we can store this as a character set instead of a 504 # branch (the compiler may optimize this even more) 505 subpattern.append((IN, _uniq(set))) 506 return subpattern 507 508 subpattern.append((BRANCH, (None, items))) 509 return subpattern 510 511def _parse(source, state, verbose, nested, first=False): 512 # parse a simple pattern 513 subpattern = SubPattern(state) 514 515 # precompute constants into local variables 516 subpatternappend = subpattern.append 517 sourceget = source.get 518 sourcematch = source.match 519 _len = len 520 _ord = ord 521 522 while True: 523 524 this = source.next 525 if this is None: 526 break # end of pattern 527 if this in "|)": 528 break # end of subpattern 529 sourceget() 530 531 if verbose: 532 # skip whitespace and comments 533 if this in WHITESPACE: 534 continue 535 if this == "#": 536 while True: 537 this = sourceget() 538 if this is None or this == "\n": 539 break 540 continue 541 542 if this[0] == "\\": 543 code = _escape(source, this, state) 544 subpatternappend(code) 545 546 elif this not in SPECIAL_CHARS: 547 subpatternappend((LITERAL, _ord(this))) 548 549 elif this == "[": 550 here = source.tell() - 1 551 # character set 552 set = [] 553 setappend = set.append 554## if sourcematch(":"): 555## pass # handle character classes 556 if source.next == '[': 557 import warnings 558 warnings.warn( 559 'Possible nested set at position %d' % source.tell(), 560 FutureWarning, stacklevel=nested + 6 561 ) 562 negate = sourcematch("^") 563 # check remaining characters 564 while True: 565 this = sourceget() 566 if this is None: 567 raise source.error("unterminated character set", 568 source.tell() - here) 569 if this == "]" and set: 570 break 571 elif this[0] == "\\": 572 code1 = _class_escape(source, this) 573 else: 574 if set and this in '-&~|' and source.next == this: 575 import warnings 576 warnings.warn( 577 'Possible set %s at position %d' % ( 578 'difference' if this == '-' else 579 'intersection' if this == '&' else 580 'symmetric difference' if this == '~' else 581 'union', 582 source.tell() - 1), 583 FutureWarning, stacklevel=nested + 6 584 ) 585 code1 = LITERAL, _ord(this) 586 if sourcematch("-"): 587 # potential range 588 that = sourceget() 589 if that is None: 590 raise source.error("unterminated character set", 591 source.tell() - here) 592 if that == "]": 593 if code1[0] is IN: 594 code1 = code1[1][0] 595 setappend(code1) 596 setappend((LITERAL, _ord("-"))) 597 break 598 if that[0] == "\\": 599 code2 = _class_escape(source, that) 600 else: 601 if that == '-': 602 import warnings 603 warnings.warn( 604 'Possible set difference at position %d' % ( 605 source.tell() - 2), 606 FutureWarning, stacklevel=nested + 6 607 ) 608 code2 = LITERAL, _ord(that) 609 if code1[0] != LITERAL or code2[0] != LITERAL: 610 msg = "bad character range %s-%s" % (this, that) 611 raise source.error(msg, len(this) + 1 + len(that)) 612 lo = code1[1] 613 hi = code2[1] 614 if hi < lo: 615 msg = "bad character range %s-%s" % (this, that) 616 raise source.error(msg, len(this) + 1 + len(that)) 617 setappend((RANGE, (lo, hi))) 618 else: 619 if code1[0] is IN: 620 code1 = code1[1][0] 621 setappend(code1) 622 623 set = _uniq(set) 624 # XXX: <fl> should move set optimization to compiler! 625 if _len(set) == 1 and set[0][0] is LITERAL: 626 # optimization 627 if negate: 628 subpatternappend((NOT_LITERAL, set[0][1])) 629 else: 630 subpatternappend(set[0]) 631 else: 632 if negate: 633 set.insert(0, (NEGATE, None)) 634 # charmap optimization can't be added here because 635 # global flags still are not known 636 subpatternappend((IN, set)) 637 638 elif this in REPEAT_CHARS: 639 # repeat previous item 640 here = source.tell() 641 if this == "?": 642 min, max = 0, 1 643 elif this == "*": 644 min, max = 0, MAXREPEAT 645 646 elif this == "+": 647 min, max = 1, MAXREPEAT 648 elif this == "{": 649 if source.next == "}": 650 subpatternappend((LITERAL, _ord(this))) 651 continue 652 653 min, max = 0, MAXREPEAT 654 lo = hi = "" 655 while source.next in DIGITS: 656 lo += sourceget() 657 if sourcematch(","): 658 while source.next in DIGITS: 659 hi += sourceget() 660 else: 661 hi = lo 662 if not sourcematch("}"): 663 subpatternappend((LITERAL, _ord(this))) 664 source.seek(here) 665 continue 666 667 if lo: 668 min = int(lo) 669 if min >= MAXREPEAT: 670 raise OverflowError("the repetition number is too large") 671 if hi: 672 max = int(hi) 673 if max >= MAXREPEAT: 674 raise OverflowError("the repetition number is too large") 675 if max < min: 676 raise source.error("min repeat greater than max repeat", 677 source.tell() - here) 678 else: 679 raise AssertionError("unsupported quantifier %r" % (char,)) 680 # figure out which item to repeat 681 if subpattern: 682 item = subpattern[-1:] 683 else: 684 item = None 685 if not item or item[0][0] is AT: 686 raise source.error("nothing to repeat", 687 source.tell() - here + len(this)) 688 if item[0][0] in _REPEATCODES: 689 raise source.error("multiple repeat", 690 source.tell() - here + len(this)) 691 if item[0][0] is SUBPATTERN: 692 group, add_flags, del_flags, p = item[0][1] 693 if group is None and not add_flags and not del_flags: 694 item = p 695 if sourcematch("?"): 696 # Non-Greedy Match 697 subpattern[-1] = (MIN_REPEAT, (min, max, item)) 698 elif sourcematch("+"): 699 # Possessive Match (Always Greedy) 700 subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) 701 else: 702 # Greedy Match 703 subpattern[-1] = (MAX_REPEAT, (min, max, item)) 704 705 elif this == ".": 706 subpatternappend((ANY, None)) 707 708 elif this == "(": 709 start = source.tell() - 1 710 capture = True 711 atomic = False 712 name = None 713 add_flags = 0 714 del_flags = 0 715 if sourcematch("?"): 716 # options 717 char = sourceget() 718 if char is None: 719 raise source.error("unexpected end of pattern") 720 if char == "P": 721 # python extensions 722 if sourcematch("<"): 723 # named group: skip forward to end of name 724 name = source.getuntil(">", "group name") 725 source.checkgroupname(name, 1) 726 elif sourcematch("="): 727 # named backreference 728 name = source.getuntil(")", "group name") 729 source.checkgroupname(name, 1) 730 gid = state.groupdict.get(name) 731 if gid is None: 732 msg = "unknown group name %r" % name 733 raise source.error(msg, len(name) + 1) 734 if not state.checkgroup(gid): 735 raise source.error("cannot refer to an open group", 736 len(name) + 1) 737 state.checklookbehindgroup(gid, source) 738 subpatternappend((GROUPREF, gid)) 739 continue 740 741 else: 742 char = sourceget() 743 if char is None: 744 raise source.error("unexpected end of pattern") 745 raise source.error("unknown extension ?P" + char, 746 len(char) + 2) 747 elif char == ":": 748 # non-capturing group 749 capture = False 750 elif char == "#": 751 # comment 752 while True: 753 if source.next is None: 754 raise source.error("missing ), unterminated comment", 755 source.tell() - start) 756 if sourceget() == ")": 757 break 758 continue 759 760 elif char in "=!<": 761 # lookahead assertions 762 dir = 1 763 if char == "<": 764 char = sourceget() 765 if char is None: 766 raise source.error("unexpected end of pattern") 767 if char not in "=!": 768 raise source.error("unknown extension ?<" + char, 769 len(char) + 2) 770 dir = -1 # lookbehind 771 lookbehindgroups = state.lookbehindgroups 772 if lookbehindgroups is None: 773 state.lookbehindgroups = state.groups 774 p = _parse_sub(source, state, verbose, nested + 1) 775 if dir < 0: 776 if lookbehindgroups is None: 777 state.lookbehindgroups = None 778 if not sourcematch(")"): 779 raise source.error("missing ), unterminated subpattern", 780 source.tell() - start) 781 if char == "=": 782 subpatternappend((ASSERT, (dir, p))) 783 elif p: 784 subpatternappend((ASSERT_NOT, (dir, p))) 785 else: 786 subpatternappend((FAILURE, ())) 787 continue 788 789 elif char == "(": 790 # conditional backreference group 791 condname = source.getuntil(")", "group name") 792 if not (condname.isdecimal() and condname.isascii()): 793 source.checkgroupname(condname, 1) 794 condgroup = state.groupdict.get(condname) 795 if condgroup is None: 796 msg = "unknown group name %r" % condname 797 raise source.error(msg, len(condname) + 1) 798 else: 799 condgroup = int(condname) 800 if not condgroup: 801 raise source.error("bad group number", 802 len(condname) + 1) 803 if condgroup >= MAXGROUPS: 804 msg = "invalid group reference %d" % condgroup 805 raise source.error(msg, len(condname) + 1) 806 if condgroup not in state.grouprefpos: 807 state.grouprefpos[condgroup] = ( 808 source.tell() - len(condname) - 1 809 ) 810 if not (condname.isdecimal() and condname.isascii()): 811 import warnings 812 warnings.warn( 813 "bad character in group name %s at position %d" % 814 (repr(condname) if source.istext else ascii(condname), 815 source.tell() - len(condname) - 1), 816 DeprecationWarning, stacklevel=nested + 6 817 ) 818 state.checklookbehindgroup(condgroup, source) 819 item_yes = _parse(source, state, verbose, nested + 1) 820 if source.match("|"): 821 item_no = _parse(source, state, verbose, nested + 1) 822 if source.next == "|": 823 raise source.error("conditional backref with more than two branches") 824 else: 825 item_no = None 826 if not source.match(")"): 827 raise source.error("missing ), unterminated subpattern", 828 source.tell() - start) 829 subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 830 continue 831 832 elif char == ">": 833 # non-capturing, atomic group 834 capture = False 835 atomic = True 836 elif char in FLAGS or char == "-": 837 # flags 838 flags = _parse_flags(source, state, char) 839 if flags is None: # global flags 840 if not first or subpattern: 841 raise source.error('global flags not at the start ' 842 'of the expression', 843 source.tell() - start) 844 verbose = state.flags & SRE_FLAG_VERBOSE 845 continue 846 847 add_flags, del_flags = flags 848 capture = False 849 else: 850 raise source.error("unknown extension ?" + char, 851 len(char) + 1) 852 853 # parse group contents 854 if capture: 855 try: 856 group = state.opengroup(name) 857 except error as err: 858 raise source.error(err.msg, len(name) + 1) from None 859 else: 860 group = None 861 sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and 862 not (del_flags & SRE_FLAG_VERBOSE)) 863 p = _parse_sub(source, state, sub_verbose, nested + 1) 864 if not source.match(")"): 865 raise source.error("missing ), unterminated subpattern", 866 source.tell() - start) 867 if group is not None: 868 state.closegroup(group, p) 869 if atomic: 870 assert group is None 871 subpatternappend((ATOMIC_GROUP, p)) 872 else: 873 subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) 874 875 elif this == "^": 876 subpatternappend((AT, AT_BEGINNING)) 877 878 elif this == "$": 879 subpatternappend((AT, AT_END)) 880 881 else: 882 raise AssertionError("unsupported special character %r" % (char,)) 883 884 # unpack non-capturing groups 885 for i in range(len(subpattern))[::-1]: 886 op, av = subpattern[i] 887 if op is SUBPATTERN: 888 group, add_flags, del_flags, p = av 889 if group is None and not add_flags and not del_flags: 890 subpattern[i: i+1] = p 891 892 return subpattern 893 894def _parse_flags(source, state, char): 895 sourceget = source.get 896 add_flags = 0 897 del_flags = 0 898 if char != "-": 899 while True: 900 flag = FLAGS[char] 901 if source.istext: 902 if char == 'L': 903 msg = "bad inline flags: cannot use 'L' flag with a str pattern" 904 raise source.error(msg) 905 else: 906 if char == 'u': 907 msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" 908 raise source.error(msg) 909 add_flags |= flag 910 if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: 911 msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" 912 raise source.error(msg) 913 char = sourceget() 914 if char is None: 915 raise source.error("missing -, : or )") 916 if char in ")-:": 917 break 918 if char not in FLAGS: 919 msg = "unknown flag" if char.isalpha() else "missing -, : or )" 920 raise source.error(msg, len(char)) 921 if char == ")": 922 state.flags |= add_flags 923 return None 924 if add_flags & GLOBAL_FLAGS: 925 raise source.error("bad inline flags: cannot turn on global flag", 1) 926 if char == "-": 927 char = sourceget() 928 if char is None: 929 raise source.error("missing flag") 930 if char not in FLAGS: 931 msg = "unknown flag" if char.isalpha() else "missing flag" 932 raise source.error(msg, len(char)) 933 while True: 934 flag = FLAGS[char] 935 if flag & TYPE_FLAGS: 936 msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" 937 raise source.error(msg) 938 del_flags |= flag 939 char = sourceget() 940 if char is None: 941 raise source.error("missing :") 942 if char == ":": 943 break 944 if char not in FLAGS: 945 msg = "unknown flag" if char.isalpha() else "missing :" 946 raise source.error(msg, len(char)) 947 assert char == ":" 948 if del_flags & GLOBAL_FLAGS: 949 raise source.error("bad inline flags: cannot turn off global flag", 1) 950 if add_flags & del_flags: 951 raise source.error("bad inline flags: flag turned on and off", 1) 952 return add_flags, del_flags 953 954def fix_flags(src, flags): 955 # Check and fix flags according to the type of pattern (str or bytes) 956 if isinstance(src, str): 957 if flags & SRE_FLAG_LOCALE: 958 raise ValueError("cannot use LOCALE flag with a str pattern") 959 if not flags & SRE_FLAG_ASCII: 960 flags |= SRE_FLAG_UNICODE 961 elif flags & SRE_FLAG_UNICODE: 962 raise ValueError("ASCII and UNICODE flags are incompatible") 963 else: 964 if flags & SRE_FLAG_UNICODE: 965 raise ValueError("cannot use UNICODE flag with a bytes pattern") 966 if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: 967 raise ValueError("ASCII and LOCALE flags are incompatible") 968 return flags 969 970def parse(str, flags=0, state=None): 971 # parse 're' pattern into list of (opcode, argument) tuples 972 973 source = Tokenizer(str) 974 975 if state is None: 976 state = State() 977 state.flags = flags 978 state.str = str 979 980 p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0) 981 p.state.flags = fix_flags(str, p.state.flags) 982 983 if source.next is not None: 984 assert source.next == ")" 985 raise source.error("unbalanced parenthesis") 986 987 for g in p.state.grouprefpos: 988 if g >= p.state.groups: 989 msg = "invalid group reference %d" % g 990 raise error(msg, str, p.state.grouprefpos[g]) 991 992 if flags & SRE_FLAG_DEBUG: 993 p.dump() 994 995 return p 996 997def parse_template(source, pattern): 998 # parse 're' replacement string into list of literals and 999 # group references 1000 s = Tokenizer(source) 1001 sget = s.get 1002 result = [] 1003 literal = [] 1004 lappend = literal.append 1005 def addliteral(): 1006 if s.istext: 1007 result.append(''.join(literal)) 1008 else: 1009 # The tokenizer implicitly decodes bytes objects as latin-1, we must 1010 # therefore re-encode the final representation. 1011 result.append(''.join(literal).encode('latin-1')) 1012 del literal[:] 1013 def addgroup(index, pos): 1014 if index > pattern.groups: 1015 raise s.error("invalid group reference %d" % index, pos) 1016 addliteral() 1017 result.append(index) 1018 groupindex = pattern.groupindex 1019 while True: 1020 this = sget() 1021 if this is None: 1022 break # end of replacement string 1023 if this[0] == "\\": 1024 # group 1025 c = this[1] 1026 if c == "g": 1027 if not s.match("<"): 1028 raise s.error("missing <") 1029 name = s.getuntil(">", "group name") 1030 if not (name.isdecimal() and name.isascii()): 1031 s.checkgroupname(name, 1) 1032 try: 1033 index = groupindex[name] 1034 except KeyError: 1035 raise IndexError("unknown group name %r" % name) from None 1036 else: 1037 index = int(name) 1038 if index >= MAXGROUPS: 1039 raise s.error("invalid group reference %d" % index, 1040 len(name) + 1) 1041 if not (name.isdecimal() and name.isascii()): 1042 import warnings 1043 warnings.warn( 1044 "bad character in group name %s at position %d" % 1045 (repr(name) if s.istext else ascii(name), 1046 s.tell() - len(name) - 1), 1047 DeprecationWarning, stacklevel=5 1048 ) 1049 addgroup(index, len(name) + 1) 1050 elif c == "0": 1051 if s.next in OCTDIGITS: 1052 this += sget() 1053 if s.next in OCTDIGITS: 1054 this += sget() 1055 lappend(chr(int(this[1:], 8) & 0xff)) 1056 elif c in DIGITS: 1057 isoctal = False 1058 if s.next in DIGITS: 1059 this += sget() 1060 if (c in OCTDIGITS and this[2] in OCTDIGITS and 1061 s.next in OCTDIGITS): 1062 this += sget() 1063 isoctal = True 1064 c = int(this[1:], 8) 1065 if c > 0o377: 1066 raise s.error('octal escape value %s outside of ' 1067 'range 0-0o377' % this, len(this)) 1068 lappend(chr(c)) 1069 if not isoctal: 1070 addgroup(int(this[1:]), len(this) - 1) 1071 else: 1072 try: 1073 this = chr(ESCAPES[this][1]) 1074 except KeyError: 1075 if c in ASCIILETTERS: 1076 raise s.error('bad escape %s' % this, len(this)) from None 1077 lappend(this) 1078 else: 1079 lappend(this) 1080 addliteral() 1081 return result 1082