1# 2# Secret Labs' Regular Expression Engine 3# 4# convert template to internal format 5# 6# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 7# 8# See the __init__.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13import _sre 14from . import _parser 15from ._constants import * 16from ._casefix import _EXTRA_CASES 17 18assert _sre.MAGIC == MAGIC, "SRE module mismatch" 19 20_LITERAL_CODES = {LITERAL, NOT_LITERAL} 21_SUCCESS_CODES = {SUCCESS, FAILURE} 22_ASSERT_CODES = {ASSERT, ASSERT_NOT} 23_UNIT_CODES = _LITERAL_CODES | {ANY, IN} 24 25_REPEATING_CODES = { 26 MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), 27 MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE), 28 POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), 29} 30 31def _combine_flags(flags, add_flags, del_flags, 32 TYPE_FLAGS=_parser.TYPE_FLAGS): 33 if add_flags & TYPE_FLAGS: 34 flags &= ~TYPE_FLAGS 35 return (flags | add_flags) & ~del_flags 36 37def _compile(code, pattern, flags): 38 # internal: compile a (sub)pattern 39 emit = code.append 40 _len = len 41 LITERAL_CODES = _LITERAL_CODES 42 REPEATING_CODES = _REPEATING_CODES 43 SUCCESS_CODES = _SUCCESS_CODES 44 ASSERT_CODES = _ASSERT_CODES 45 iscased = None 46 tolower = None 47 fixes = None 48 if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: 49 if flags & SRE_FLAG_UNICODE: 50 iscased = _sre.unicode_iscased 51 tolower = _sre.unicode_tolower 52 fixes = _EXTRA_CASES 53 else: 54 iscased = _sre.ascii_iscased 55 tolower = _sre.ascii_tolower 56 for op, av in pattern: 57 if op in LITERAL_CODES: 58 if not flags & SRE_FLAG_IGNORECASE: 59 emit(op) 60 emit(av) 61 elif flags & SRE_FLAG_LOCALE: 62 emit(OP_LOCALE_IGNORE[op]) 63 emit(av) 64 elif not iscased(av): 65 emit(op) 66 emit(av) 67 else: 68 lo = tolower(av) 69 if not fixes: # ascii 70 emit(OP_IGNORE[op]) 71 emit(lo) 72 elif lo not in fixes: 73 emit(OP_UNICODE_IGNORE[op]) 74 emit(lo) 75 else: 76 emit(IN_UNI_IGNORE) 77 skip = _len(code); emit(0) 78 if op is NOT_LITERAL: 79 emit(NEGATE) 80 for k in (lo,) + fixes[lo]: 81 emit(LITERAL) 82 emit(k) 83 emit(FAILURE) 84 code[skip] = _len(code) - skip 85 elif op is IN: 86 charset, hascased = _optimize_charset(av, iscased, tolower, fixes) 87 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: 88 emit(IN_LOC_IGNORE) 89 elif not hascased: 90 emit(IN) 91 elif not fixes: # ascii 92 emit(IN_IGNORE) 93 else: 94 emit(IN_UNI_IGNORE) 95 skip = _len(code); emit(0) 96 _compile_charset(charset, flags, code) 97 code[skip] = _len(code) - skip 98 elif op is ANY: 99 if flags & SRE_FLAG_DOTALL: 100 emit(ANY_ALL) 101 else: 102 emit(ANY) 103 elif op in REPEATING_CODES: 104 if _simple(av[2]): 105 emit(REPEATING_CODES[op][2]) 106 skip = _len(code); emit(0) 107 emit(av[0]) 108 emit(av[1]) 109 _compile(code, av[2], flags) 110 emit(SUCCESS) 111 code[skip] = _len(code) - skip 112 else: 113 emit(REPEATING_CODES[op][0]) 114 skip = _len(code); emit(0) 115 emit(av[0]) 116 emit(av[1]) 117 _compile(code, av[2], flags) 118 code[skip] = _len(code) - skip 119 emit(REPEATING_CODES[op][1]) 120 elif op is SUBPATTERN: 121 group, add_flags, del_flags, p = av 122 if group: 123 emit(MARK) 124 emit((group-1)*2) 125 # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) 126 _compile(code, p, _combine_flags(flags, add_flags, del_flags)) 127 if group: 128 emit(MARK) 129 emit((group-1)*2+1) 130 elif op is ATOMIC_GROUP: 131 # Atomic Groups are handled by starting with an Atomic 132 # Group op code, then putting in the atomic group pattern 133 # and finally a success op code to tell any repeat 134 # operations within the Atomic Group to stop eating and 135 # pop their stack if they reach it 136 emit(ATOMIC_GROUP) 137 skip = _len(code); emit(0) 138 _compile(code, av, flags) 139 emit(SUCCESS) 140 code[skip] = _len(code) - skip 141 elif op in SUCCESS_CODES: 142 emit(op) 143 elif op in ASSERT_CODES: 144 emit(op) 145 skip = _len(code); emit(0) 146 if av[0] >= 0: 147 emit(0) # look ahead 148 else: 149 lo, hi = av[1].getwidth() 150 if lo > MAXCODE: 151 raise error("looks too much behind") 152 if lo != hi: 153 raise PatternError("look-behind requires fixed-width pattern") 154 emit(lo) # look behind 155 _compile(code, av[1], flags) 156 emit(SUCCESS) 157 code[skip] = _len(code) - skip 158 elif op is AT: 159 emit(op) 160 if flags & SRE_FLAG_MULTILINE: 161 av = AT_MULTILINE.get(av, av) 162 if flags & SRE_FLAG_LOCALE: 163 av = AT_LOCALE.get(av, av) 164 elif flags & SRE_FLAG_UNICODE: 165 av = AT_UNICODE.get(av, av) 166 emit(av) 167 elif op is BRANCH: 168 emit(op) 169 tail = [] 170 tailappend = tail.append 171 for av in av[1]: 172 skip = _len(code); emit(0) 173 # _compile_info(code, av, flags) 174 _compile(code, av, flags) 175 emit(JUMP) 176 tailappend(_len(code)); emit(0) 177 code[skip] = _len(code) - skip 178 emit(FAILURE) # end of branch 179 for tail in tail: 180 code[tail] = _len(code) - tail 181 elif op is CATEGORY: 182 emit(op) 183 if flags & SRE_FLAG_LOCALE: 184 av = CH_LOCALE[av] 185 elif flags & SRE_FLAG_UNICODE: 186 av = CH_UNICODE[av] 187 emit(av) 188 elif op is GROUPREF: 189 if not flags & SRE_FLAG_IGNORECASE: 190 emit(op) 191 elif flags & SRE_FLAG_LOCALE: 192 emit(GROUPREF_LOC_IGNORE) 193 elif not fixes: # ascii 194 emit(GROUPREF_IGNORE) 195 else: 196 emit(GROUPREF_UNI_IGNORE) 197 emit(av-1) 198 elif op is GROUPREF_EXISTS: 199 emit(op) 200 emit(av[0]-1) 201 skipyes = _len(code); emit(0) 202 _compile(code, av[1], flags) 203 if av[2]: 204 emit(JUMP) 205 skipno = _len(code); emit(0) 206 code[skipyes] = _len(code) - skipyes + 1 207 _compile(code, av[2], flags) 208 code[skipno] = _len(code) - skipno 209 else: 210 code[skipyes] = _len(code) - skipyes + 1 211 else: 212 raise PatternError(f"internal: unsupported operand type {op!r}") 213 214def _compile_charset(charset, flags, code): 215 # compile charset subprogram 216 emit = code.append 217 for op, av in charset: 218 emit(op) 219 if op is NEGATE: 220 pass 221 elif op is LITERAL: 222 emit(av) 223 elif op is RANGE or op is RANGE_UNI_IGNORE: 224 emit(av[0]) 225 emit(av[1]) 226 elif op is CHARSET: 227 code.extend(av) 228 elif op is BIGCHARSET: 229 code.extend(av) 230 elif op is CATEGORY: 231 if flags & SRE_FLAG_LOCALE: 232 emit(CH_LOCALE[av]) 233 elif flags & SRE_FLAG_UNICODE: 234 emit(CH_UNICODE[av]) 235 else: 236 emit(av) 237 else: 238 raise PatternError(f"internal: unsupported set operator {op!r}") 239 emit(FAILURE) 240 241def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): 242 # internal: optimize character set 243 out = [] 244 tail = [] 245 charmap = bytearray(256) 246 hascased = False 247 for op, av in charset: 248 while True: 249 try: 250 if op is LITERAL: 251 if fixup: # IGNORECASE and not LOCALE 252 av = fixup(av) 253 charmap[av] = 1 254 if fixes and av in fixes: 255 for k in fixes[av]: 256 charmap[k] = 1 257 if not hascased and iscased(av): 258 hascased = True 259 else: 260 charmap[av] = 1 261 elif op is RANGE: 262 r = range(av[0], av[1]+1) 263 if fixup: # IGNORECASE and not LOCALE 264 if fixes: 265 for i in map(fixup, r): 266 charmap[i] = 1 267 if i in fixes: 268 for k in fixes[i]: 269 charmap[k] = 1 270 else: 271 for i in map(fixup, r): 272 charmap[i] = 1 273 if not hascased: 274 hascased = any(map(iscased, r)) 275 else: 276 for i in r: 277 charmap[i] = 1 278 elif op is NEGATE: 279 out.append((op, av)) 280 else: 281 tail.append((op, av)) 282 except IndexError: 283 if len(charmap) == 256: 284 # character set contains non-UCS1 character codes 285 charmap += b'\0' * 0xff00 286 continue 287 # Character set contains non-BMP character codes. 288 # For range, all BMP characters in the range are already 289 # proceeded. 290 if fixup: # IGNORECASE and not LOCALE 291 # For now, IN_UNI_IGNORE+LITERAL and 292 # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP 293 # characters, because two characters (at least one of 294 # which is not in the BMP) match case-insensitively 295 # if and only if: 296 # 1) c1.lower() == c2.lower() 297 # 2) c1.lower() == c2 or c1.lower().upper() == c2 298 # Also, both c.lower() and c.lower().upper() are single 299 # characters for every non-BMP character. 300 if op is RANGE: 301 if fixes: # not ASCII 302 op = RANGE_UNI_IGNORE 303 hascased = True 304 else: 305 assert op is LITERAL 306 if not hascased and iscased(av): 307 hascased = True 308 tail.append((op, av)) 309 break 310 311 # compress character map 312 runs = [] 313 q = 0 314 while True: 315 p = charmap.find(1, q) 316 if p < 0: 317 break 318 if len(runs) >= 2: 319 runs = None 320 break 321 q = charmap.find(0, p) 322 if q < 0: 323 runs.append((p, len(charmap))) 324 break 325 runs.append((p, q)) 326 if runs is not None: 327 # use literal/range 328 for p, q in runs: 329 if q - p == 1: 330 out.append((LITERAL, p)) 331 else: 332 out.append((RANGE, (p, q - 1))) 333 out += tail 334 # if the case was changed or new representation is more compact 335 if hascased or len(out) < len(charset): 336 return out, hascased 337 # else original character set is good enough 338 return charset, hascased 339 340 # use bitmap 341 if len(charmap) == 256: 342 data = _mk_bitmap(charmap) 343 out.append((CHARSET, data)) 344 out += tail 345 return out, hascased 346 347 # To represent a big charset, first a bitmap of all characters in the 348 # set is constructed. Then, this bitmap is sliced into chunks of 256 349 # characters, duplicate chunks are eliminated, and each chunk is 350 # given a number. In the compiled expression, the charset is 351 # represented by a 32-bit word sequence, consisting of one word for 352 # the number of different chunks, a sequence of 256 bytes (64 words) 353 # of chunk numbers indexed by their original chunk position, and a 354 # sequence of 256-bit chunks (8 words each). 355 356 # Compression is normally good: in a typical charset, large ranges of 357 # Unicode will be either completely excluded (e.g. if only cyrillic 358 # letters are to be matched), or completely included (e.g. if large 359 # subranges of Kanji match). These ranges will be represented by 360 # chunks of all one-bits or all zero-bits. 361 362 # Matching can be also done efficiently: the more significant byte of 363 # the Unicode character is an index into the chunk number, and the 364 # less significant byte is a bit index in the chunk (just like the 365 # CHARSET matching). 366 367 charmap = bytes(charmap) # should be hashable 368 comps = {} 369 mapping = bytearray(256) 370 block = 0 371 data = bytearray() 372 for i in range(0, 65536, 256): 373 chunk = charmap[i: i + 256] 374 if chunk in comps: 375 mapping[i // 256] = comps[chunk] 376 else: 377 mapping[i // 256] = comps[chunk] = block 378 block += 1 379 data += chunk 380 data = _mk_bitmap(data) 381 data[0:0] = [block] + _bytes_to_codes(mapping) 382 out.append((BIGCHARSET, data)) 383 out += tail 384 return out, hascased 385 386_CODEBITS = _sre.CODESIZE * 8 387MAXCODE = (1 << _CODEBITS) - 1 388_BITS_TRANS = b'0' + b'1' * 255 389def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): 390 s = bits.translate(_BITS_TRANS)[::-1] 391 return [_int(s[i - _CODEBITS: i], 2) 392 for i in range(len(s), 0, -_CODEBITS)] 393 394def _bytes_to_codes(b): 395 # Convert block indices to word array 396 a = memoryview(b).cast('I') 397 assert a.itemsize == _sre.CODESIZE 398 assert len(a) * a.itemsize == len(b) 399 return a.tolist() 400 401def _simple(p): 402 # check if this subpattern is a "simple" operator 403 if len(p) != 1: 404 return False 405 op, av = p[0] 406 if op is SUBPATTERN: 407 return av[0] is None and _simple(av[-1]) 408 return op in _UNIT_CODES 409 410def _generate_overlap_table(prefix): 411 """ 412 Generate an overlap table for the following prefix. 413 An overlap table is a table of the same size as the prefix which 414 informs about the potential self-overlap for each index in the prefix: 415 - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...] 416 - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with 417 prefix[0:k] 418 """ 419 table = [0] * len(prefix) 420 for i in range(1, len(prefix)): 421 idx = table[i - 1] 422 while prefix[i] != prefix[idx]: 423 if idx == 0: 424 table[i] = 0 425 break 426 idx = table[idx - 1] 427 else: 428 table[i] = idx + 1 429 return table 430 431def _get_iscased(flags): 432 if not flags & SRE_FLAG_IGNORECASE: 433 return None 434 elif flags & SRE_FLAG_UNICODE: 435 return _sre.unicode_iscased 436 else: 437 return _sre.ascii_iscased 438 439def _get_literal_prefix(pattern, flags): 440 # look for literal prefix 441 prefix = [] 442 prefixappend = prefix.append 443 prefix_skip = None 444 iscased = _get_iscased(flags) 445 for op, av in pattern.data: 446 if op is LITERAL: 447 if iscased and iscased(av): 448 break 449 prefixappend(av) 450 elif op is SUBPATTERN: 451 group, add_flags, del_flags, p = av 452 flags1 = _combine_flags(flags, add_flags, del_flags) 453 if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: 454 break 455 prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) 456 if prefix_skip is None: 457 if group is not None: 458 prefix_skip = len(prefix) 459 elif prefix_skip1 is not None: 460 prefix_skip = len(prefix) + prefix_skip1 461 prefix.extend(prefix1) 462 if not got_all: 463 break 464 else: 465 break 466 else: 467 return prefix, prefix_skip, True 468 return prefix, prefix_skip, False 469 470def _get_charset_prefix(pattern, flags): 471 while True: 472 if not pattern.data: 473 return None 474 op, av = pattern.data[0] 475 if op is not SUBPATTERN: 476 break 477 group, add_flags, del_flags, pattern = av 478 flags = _combine_flags(flags, add_flags, del_flags) 479 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: 480 return None 481 482 iscased = _get_iscased(flags) 483 if op is LITERAL: 484 if iscased and iscased(av): 485 return None 486 return [(op, av)] 487 elif op is BRANCH: 488 charset = [] 489 charsetappend = charset.append 490 for p in av[1]: 491 if not p: 492 return None 493 op, av = p[0] 494 if op is LITERAL and not (iscased and iscased(av)): 495 charsetappend((op, av)) 496 else: 497 return None 498 return charset 499 elif op is IN: 500 charset = av 501 if iscased: 502 for op, av in charset: 503 if op is LITERAL: 504 if iscased(av): 505 return None 506 elif op is RANGE: 507 if av[1] > 0xffff: 508 return None 509 if any(map(iscased, range(av[0], av[1]+1))): 510 return None 511 return charset 512 return None 513 514def _compile_info(code, pattern, flags): 515 # internal: compile an info block. in the current version, 516 # this contains min/max pattern width, and an optional literal 517 # prefix or a character map 518 lo, hi = pattern.getwidth() 519 if hi > MAXCODE: 520 hi = MAXCODE 521 if lo == 0: 522 code.extend([INFO, 4, 0, lo, hi]) 523 return 524 # look for a literal prefix 525 prefix = [] 526 prefix_skip = 0 527 charset = [] # not used 528 if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): 529 # look for literal prefix 530 prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) 531 # if no prefix, look for charset prefix 532 if not prefix: 533 charset = _get_charset_prefix(pattern, flags) 534## if prefix: 535## print("*** PREFIX", prefix, prefix_skip) 536## if charset: 537## print("*** CHARSET", charset) 538 # add an info block 539 emit = code.append 540 emit(INFO) 541 skip = len(code); emit(0) 542 # literal flag 543 mask = 0 544 if prefix: 545 mask = SRE_INFO_PREFIX 546 if prefix_skip is None and got_all: 547 mask = mask | SRE_INFO_LITERAL 548 elif charset: 549 mask = mask | SRE_INFO_CHARSET 550 emit(mask) 551 # pattern length 552 if lo < MAXCODE: 553 emit(lo) 554 else: 555 emit(MAXCODE) 556 prefix = prefix[:MAXCODE] 557 emit(hi) 558 # add literal prefix 559 if prefix: 560 emit(len(prefix)) # length 561 if prefix_skip is None: 562 prefix_skip = len(prefix) 563 emit(prefix_skip) # skip 564 code.extend(prefix) 565 # generate overlap table 566 code.extend(_generate_overlap_table(prefix)) 567 elif charset: 568 charset, hascased = _optimize_charset(charset) 569 assert not hascased 570 _compile_charset(charset, flags, code) 571 code[skip] = len(code) - skip 572 573def isstring(obj): 574 return isinstance(obj, (str, bytes)) 575 576def _code(p, flags): 577 578 flags = p.state.flags | flags 579 code = [] 580 581 # compile info block 582 _compile_info(code, p, flags) 583 584 # compile the pattern 585 _compile(code, p.data, flags) 586 587 code.append(SUCCESS) 588 589 return code 590 591def _hex_code(code): 592 return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) 593 594def dis(code): 595 import sys 596 597 labels = set() 598 level = 0 599 offset_width = len(str(len(code) - 1)) 600 601 def dis_(start, end): 602 def print_(*args, to=None): 603 if to is not None: 604 labels.add(to) 605 args += ('(to %d)' % (to,),) 606 print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'), 607 end=' '*(level-1)) 608 print(*args) 609 610 def print_2(*args): 611 print(end=' '*(offset_width + 2*level)) 612 print(*args) 613 614 nonlocal level 615 level += 1 616 i = start 617 while i < end: 618 start = i 619 op = code[i] 620 i += 1 621 op = OPCODES[op] 622 if op in (SUCCESS, FAILURE, ANY, ANY_ALL, 623 MAX_UNTIL, MIN_UNTIL, NEGATE): 624 print_(op) 625 elif op in (LITERAL, NOT_LITERAL, 626 LITERAL_IGNORE, NOT_LITERAL_IGNORE, 627 LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE, 628 LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE): 629 arg = code[i] 630 i += 1 631 print_(op, '%#02x (%r)' % (arg, chr(arg))) 632 elif op is AT: 633 arg = code[i] 634 i += 1 635 arg = str(ATCODES[arg]) 636 assert arg[:3] == 'AT_' 637 print_(op, arg[3:]) 638 elif op is CATEGORY: 639 arg = code[i] 640 i += 1 641 arg = str(CHCODES[arg]) 642 assert arg[:9] == 'CATEGORY_' 643 print_(op, arg[9:]) 644 elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE): 645 skip = code[i] 646 print_(op, skip, to=i+skip) 647 dis_(i+1, i+skip) 648 i += skip 649 elif op in (RANGE, RANGE_UNI_IGNORE): 650 lo, hi = code[i: i+2] 651 i += 2 652 print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) 653 elif op is CHARSET: 654 print_(op, _hex_code(code[i: i + 256//_CODEBITS])) 655 i += 256//_CODEBITS 656 elif op is BIGCHARSET: 657 arg = code[i] 658 i += 1 659 mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder) 660 for x in code[i: i + 256//_sre.CODESIZE])) 661 print_(op, arg, mapping) 662 i += 256//_sre.CODESIZE 663 level += 1 664 for j in range(arg): 665 print_2(_hex_code(code[i: i + 256//_CODEBITS])) 666 i += 256//_CODEBITS 667 level -= 1 668 elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, 669 GROUPREF_LOC_IGNORE): 670 arg = code[i] 671 i += 1 672 print_(op, arg) 673 elif op is JUMP: 674 skip = code[i] 675 print_(op, skip, to=i+skip) 676 i += 1 677 elif op is BRANCH: 678 skip = code[i] 679 print_(op, skip, to=i+skip) 680 while skip: 681 dis_(i+1, i+skip) 682 i += skip 683 start = i 684 skip = code[i] 685 if skip: 686 print_('branch', skip, to=i+skip) 687 else: 688 print_(FAILURE) 689 i += 1 690 elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, 691 POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): 692 skip, min, max = code[i: i+3] 693 if max == MAXREPEAT: 694 max = 'MAXREPEAT' 695 print_(op, skip, min, max, to=i+skip) 696 dis_(i+3, i+skip) 697 i += skip 698 elif op is GROUPREF_EXISTS: 699 arg, skip = code[i: i+2] 700 print_(op, arg, skip, to=i+skip) 701 i += 2 702 elif op in (ASSERT, ASSERT_NOT): 703 skip, arg = code[i: i+2] 704 print_(op, skip, arg, to=i+skip) 705 dis_(i+2, i+skip) 706 i += skip 707 elif op is ATOMIC_GROUP: 708 skip = code[i] 709 print_(op, skip, to=i+skip) 710 dis_(i+1, i+skip) 711 i += skip 712 elif op is INFO: 713 skip, flags, min, max = code[i: i+4] 714 if max == MAXREPEAT: 715 max = 'MAXREPEAT' 716 print_(op, skip, bin(flags), min, max, to=i+skip) 717 start = i+4 718 if flags & SRE_INFO_PREFIX: 719 prefix_len, prefix_skip = code[i+4: i+6] 720 print_2(' prefix_skip', prefix_skip) 721 start = i + 6 722 prefix = code[start: start+prefix_len] 723 print_2(' prefix', 724 '[%s]' % ', '.join('%#02x' % x for x in prefix), 725 '(%r)' % ''.join(map(chr, prefix))) 726 start += prefix_len 727 print_2(' overlap', code[start: start+prefix_len]) 728 start += prefix_len 729 if flags & SRE_INFO_CHARSET: 730 level += 1 731 print_2('in') 732 dis_(start, i+skip) 733 level -= 1 734 i += skip 735 else: 736 raise ValueError(op) 737 738 level -= 1 739 740 dis_(0, len(code)) 741 742 743def compile(p, flags=0): 744 # internal: convert pattern list to internal format 745 746 if isstring(p): 747 pattern = p 748 p = _parser.parse(p, flags) 749 else: 750 pattern = None 751 752 code = _code(p, flags) 753 754 if flags & SRE_FLAG_DEBUG: 755 print() 756 dis(code) 757 758 # map in either direction 759 groupindex = p.state.groupdict 760 indexgroup = [None] * p.state.groups 761 for k, i in groupindex.items(): 762 indexgroup[i] = k 763 764 return _sre.compile( 765 pattern, flags | p.state.flags, code, 766 p.state.groups-1, 767 groupindex, tuple(indexgroup) 768 ) 769