1# 2# Secret Labs' Regular Expression Engine 3# 4# convert template to internal format 5# 6# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13import _sre 14import sre_parse 15from sre_constants import * 16 17assert _sre.MAGIC == MAGIC, "SRE module mismatch" 18 19_LITERAL_CODES = {LITERAL, NOT_LITERAL} 20_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} 21_SUCCESS_CODES = {SUCCESS, FAILURE} 22_ASSERT_CODES = {ASSERT, ASSERT_NOT} 23_UNIT_CODES = _LITERAL_CODES | {ANY, IN} 24 25# Sets of lowercase characters which have the same uppercase. 26_equivalences = ( 27 # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I 28 (0x69, 0x131), # iı 29 # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S 30 (0x73, 0x17f), # sſ 31 # MICRO SIGN, GREEK SMALL LETTER MU 32 (0xb5, 0x3bc), # µμ 33 # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI 34 (0x345, 0x3b9, 0x1fbe), # \u0345ιι 35 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA 36 (0x390, 0x1fd3), # ΐΐ 37 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA 38 (0x3b0, 0x1fe3), # ΰΰ 39 # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL 40 (0x3b2, 0x3d0), # βϐ 41 # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL 42 (0x3b5, 0x3f5), # εϵ 43 # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL 44 (0x3b8, 0x3d1), # θϑ 45 # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL 46 (0x3ba, 0x3f0), # κϰ 47 # GREEK SMALL LETTER PI, GREEK PI SYMBOL 48 (0x3c0, 0x3d6), # πϖ 49 # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL 50 (0x3c1, 0x3f1), # ρϱ 51 # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA 52 (0x3c2, 0x3c3), # ςσ 53 # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL 54 (0x3c6, 0x3d5), # φϕ 55 # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE 56 (0x1e61, 0x1e9b), # ṡẛ 57 # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST 58 (0xfb05, 0xfb06), # ſtst 59) 60 61# Maps the lowercase code to lowercase codes which have the same uppercase. 62_ignorecase_fixes = {i: tuple(j for j in t if i != j) 63 for t in _equivalences for i in t} 64 65def _combine_flags(flags, add_flags, del_flags, 66 TYPE_FLAGS=sre_parse.TYPE_FLAGS): 67 if add_flags & TYPE_FLAGS: 68 flags &= ~TYPE_FLAGS 69 return (flags | add_flags) & ~del_flags 70 71def _compile(code, pattern, flags): 72 # internal: compile a (sub)pattern 73 emit = code.append 74 _len = len 75 LITERAL_CODES = _LITERAL_CODES 76 REPEATING_CODES = _REPEATING_CODES 77 SUCCESS_CODES = _SUCCESS_CODES 78 ASSERT_CODES = _ASSERT_CODES 79 iscased = None 80 tolower = None 81 fixes = None 82 if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: 83 if flags & SRE_FLAG_UNICODE: 84 iscased = _sre.unicode_iscased 85 tolower = _sre.unicode_tolower 86 fixes = _ignorecase_fixes 87 else: 88 iscased = _sre.ascii_iscased 89 tolower = _sre.ascii_tolower 90 for op, av in pattern: 91 if op in LITERAL_CODES: 92 if not flags & SRE_FLAG_IGNORECASE: 93 emit(op) 94 emit(av) 95 elif flags & SRE_FLAG_LOCALE: 96 emit(OP_LOCALE_IGNORE[op]) 97 emit(av) 98 elif not iscased(av): 99 emit(op) 100 emit(av) 101 else: 102 lo = tolower(av) 103 if not fixes: # ascii 104 emit(OP_IGNORE[op]) 105 emit(lo) 106 elif lo not in fixes: 107 emit(OP_UNICODE_IGNORE[op]) 108 emit(lo) 109 else: 110 emit(IN_UNI_IGNORE) 111 skip = _len(code); emit(0) 112 if op is NOT_LITERAL: 113 emit(NEGATE) 114 for k in (lo,) + fixes[lo]: 115 emit(LITERAL) 116 emit(k) 117 emit(FAILURE) 118 code[skip] = _len(code) - skip 119 elif op is IN: 120 charset, hascased = _optimize_charset(av, iscased, tolower, fixes) 121 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: 122 emit(IN_LOC_IGNORE) 123 elif not hascased: 124 emit(IN) 125 elif not fixes: # ascii 126 emit(IN_IGNORE) 127 else: 128 emit(IN_UNI_IGNORE) 129 skip = _len(code); emit(0) 130 _compile_charset(charset, flags, code) 131 code[skip] = _len(code) - skip 132 elif op is ANY: 133 if flags & SRE_FLAG_DOTALL: 134 emit(ANY_ALL) 135 else: 136 emit(ANY) 137 elif op in REPEATING_CODES: 138 if flags & SRE_FLAG_TEMPLATE: 139 raise error("internal: unsupported template operator %r" % (op,)) 140 if _simple(av[2]): 141 if op is MAX_REPEAT: 142 emit(REPEAT_ONE) 143 else: 144 emit(MIN_REPEAT_ONE) 145 skip = _len(code); emit(0) 146 emit(av[0]) 147 emit(av[1]) 148 _compile(code, av[2], flags) 149 emit(SUCCESS) 150 code[skip] = _len(code) - skip 151 else: 152 emit(REPEAT) 153 skip = _len(code); emit(0) 154 emit(av[0]) 155 emit(av[1]) 156 _compile(code, av[2], flags) 157 code[skip] = _len(code) - skip 158 if op is MAX_REPEAT: 159 emit(MAX_UNTIL) 160 else: 161 emit(MIN_UNTIL) 162 elif op is SUBPATTERN: 163 group, add_flags, del_flags, p = av 164 if group: 165 emit(MARK) 166 emit((group-1)*2) 167 # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) 168 _compile(code, p, _combine_flags(flags, add_flags, del_flags)) 169 if group: 170 emit(MARK) 171 emit((group-1)*2+1) 172 elif op in SUCCESS_CODES: 173 emit(op) 174 elif op in ASSERT_CODES: 175 emit(op) 176 skip = _len(code); emit(0) 177 if av[0] >= 0: 178 emit(0) # look ahead 179 else: 180 lo, hi = av[1].getwidth() 181 if lo != hi: 182 raise error("look-behind requires fixed-width pattern") 183 emit(lo) # look behind 184 _compile(code, av[1], flags) 185 emit(SUCCESS) 186 code[skip] = _len(code) - skip 187 elif op is CALL: 188 emit(op) 189 skip = _len(code); emit(0) 190 _compile(code, av, flags) 191 emit(SUCCESS) 192 code[skip] = _len(code) - skip 193 elif op is AT: 194 emit(op) 195 if flags & SRE_FLAG_MULTILINE: 196 av = AT_MULTILINE.get(av, av) 197 if flags & SRE_FLAG_LOCALE: 198 av = AT_LOCALE.get(av, av) 199 elif flags & SRE_FLAG_UNICODE: 200 av = AT_UNICODE.get(av, av) 201 emit(av) 202 elif op is BRANCH: 203 emit(op) 204 tail = [] 205 tailappend = tail.append 206 for av in av[1]: 207 skip = _len(code); emit(0) 208 # _compile_info(code, av, flags) 209 _compile(code, av, flags) 210 emit(JUMP) 211 tailappend(_len(code)); emit(0) 212 code[skip] = _len(code) - skip 213 emit(FAILURE) # end of branch 214 for tail in tail: 215 code[tail] = _len(code) - tail 216 elif op is CATEGORY: 217 emit(op) 218 if flags & SRE_FLAG_LOCALE: 219 av = CH_LOCALE[av] 220 elif flags & SRE_FLAG_UNICODE: 221 av = CH_UNICODE[av] 222 emit(av) 223 elif op is GROUPREF: 224 if not flags & SRE_FLAG_IGNORECASE: 225 emit(op) 226 elif flags & SRE_FLAG_LOCALE: 227 emit(GROUPREF_LOC_IGNORE) 228 elif not fixes: # ascii 229 emit(GROUPREF_IGNORE) 230 else: 231 emit(GROUPREF_UNI_IGNORE) 232 emit(av-1) 233 elif op is GROUPREF_EXISTS: 234 emit(op) 235 emit(av[0]-1) 236 skipyes = _len(code); emit(0) 237 _compile(code, av[1], flags) 238 if av[2]: 239 emit(JUMP) 240 skipno = _len(code); emit(0) 241 code[skipyes] = _len(code) - skipyes + 1 242 _compile(code, av[2], flags) 243 code[skipno] = _len(code) - skipno 244 else: 245 code[skipyes] = _len(code) - skipyes + 1 246 else: 247 raise error("internal: unsupported operand type %r" % (op,)) 248 249def _compile_charset(charset, flags, code): 250 # compile charset subprogram 251 emit = code.append 252 for op, av in charset: 253 emit(op) 254 if op is NEGATE: 255 pass 256 elif op is LITERAL: 257 emit(av) 258 elif op is RANGE or op is RANGE_UNI_IGNORE: 259 emit(av[0]) 260 emit(av[1]) 261 elif op is CHARSET: 262 code.extend(av) 263 elif op is BIGCHARSET: 264 code.extend(av) 265 elif op is CATEGORY: 266 if flags & SRE_FLAG_LOCALE: 267 emit(CH_LOCALE[av]) 268 elif flags & SRE_FLAG_UNICODE: 269 emit(CH_UNICODE[av]) 270 else: 271 emit(av) 272 else: 273 raise error("internal: unsupported set operator %r" % (op,)) 274 emit(FAILURE) 275 276def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): 277 # internal: optimize character set 278 out = [] 279 tail = [] 280 charmap = bytearray(256) 281 hascased = False 282 for op, av in charset: 283 while True: 284 try: 285 if op is LITERAL: 286 if fixup: 287 lo = fixup(av) 288 charmap[lo] = 1 289 if fixes and lo in fixes: 290 for k in fixes[lo]: 291 charmap[k] = 1 292 if not hascased and iscased(av): 293 hascased = True 294 else: 295 charmap[av] = 1 296 elif op is RANGE: 297 r = range(av[0], av[1]+1) 298 if fixup: 299 if fixes: 300 for i in map(fixup, r): 301 charmap[i] = 1 302 if i in fixes: 303 for k in fixes[i]: 304 charmap[k] = 1 305 else: 306 for i in map(fixup, r): 307 charmap[i] = 1 308 if not hascased: 309 hascased = any(map(iscased, r)) 310 else: 311 for i in r: 312 charmap[i] = 1 313 elif op is NEGATE: 314 out.append((op, av)) 315 else: 316 tail.append((op, av)) 317 except IndexError: 318 if len(charmap) == 256: 319 # character set contains non-UCS1 character codes 320 charmap += b'\0' * 0xff00 321 continue 322 # Character set contains non-BMP character codes. 323 if fixup: 324 hascased = True 325 # There are only two ranges of cased non-BMP characters: 326 # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), 327 # and for both ranges RANGE_UNI_IGNORE works. 328 if op is RANGE: 329 op = RANGE_UNI_IGNORE 330 tail.append((op, av)) 331 break 332 333 # compress character map 334 runs = [] 335 q = 0 336 while True: 337 p = charmap.find(1, q) 338 if p < 0: 339 break 340 if len(runs) >= 2: 341 runs = None 342 break 343 q = charmap.find(0, p) 344 if q < 0: 345 runs.append((p, len(charmap))) 346 break 347 runs.append((p, q)) 348 if runs is not None: 349 # use literal/range 350 for p, q in runs: 351 if q - p == 1: 352 out.append((LITERAL, p)) 353 else: 354 out.append((RANGE, (p, q - 1))) 355 out += tail 356 # if the case was changed or new representation is more compact 357 if hascased or len(out) < len(charset): 358 return out, hascased 359 # else original character set is good enough 360 return charset, hascased 361 362 # use bitmap 363 if len(charmap) == 256: 364 data = _mk_bitmap(charmap) 365 out.append((CHARSET, data)) 366 out += tail 367 return out, hascased 368 369 # To represent a big charset, first a bitmap of all characters in the 370 # set is constructed. Then, this bitmap is sliced into chunks of 256 371 # characters, duplicate chunks are eliminated, and each chunk is 372 # given a number. In the compiled expression, the charset is 373 # represented by a 32-bit word sequence, consisting of one word for 374 # the number of different chunks, a sequence of 256 bytes (64 words) 375 # of chunk numbers indexed by their original chunk position, and a 376 # sequence of 256-bit chunks (8 words each). 377 378 # Compression is normally good: in a typical charset, large ranges of 379 # Unicode will be either completely excluded (e.g. if only cyrillic 380 # letters are to be matched), or completely included (e.g. if large 381 # subranges of Kanji match). These ranges will be represented by 382 # chunks of all one-bits or all zero-bits. 383 384 # Matching can be also done efficiently: the more significant byte of 385 # the Unicode character is an index into the chunk number, and the 386 # less significant byte is a bit index in the chunk (just like the 387 # CHARSET matching). 388 389 charmap = bytes(charmap) # should be hashable 390 comps = {} 391 mapping = bytearray(256) 392 block = 0 393 data = bytearray() 394 for i in range(0, 65536, 256): 395 chunk = charmap[i: i + 256] 396 if chunk in comps: 397 mapping[i // 256] = comps[chunk] 398 else: 399 mapping[i // 256] = comps[chunk] = block 400 block += 1 401 data += chunk 402 data = _mk_bitmap(data) 403 data[0:0] = [block] + _bytes_to_codes(mapping) 404 out.append((BIGCHARSET, data)) 405 out += tail 406 return out, hascased 407 408_CODEBITS = _sre.CODESIZE * 8 409MAXCODE = (1 << _CODEBITS) - 1 410_BITS_TRANS = b'0' + b'1' * 255 411def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): 412 s = bits.translate(_BITS_TRANS)[::-1] 413 return [_int(s[i - _CODEBITS: i], 2) 414 for i in range(len(s), 0, -_CODEBITS)] 415 416def _bytes_to_codes(b): 417 # Convert block indices to word array 418 a = memoryview(b).cast('I') 419 assert a.itemsize == _sre.CODESIZE 420 assert len(a) * a.itemsize == len(b) 421 return a.tolist() 422 423def _simple(p): 424 # check if this subpattern is a "simple" operator 425 if len(p) != 1: 426 return False 427 op, av = p[0] 428 if op is SUBPATTERN: 429 return av[0] is None and _simple(av[-1]) 430 return op in _UNIT_CODES 431 432def _generate_overlap_table(prefix): 433 """ 434 Generate an overlap table for the following prefix. 435 An overlap table is a table of the same size as the prefix which 436 informs about the potential self-overlap for each index in the prefix: 437 - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...] 438 - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with 439 prefix[0:k] 440 """ 441 table = [0] * len(prefix) 442 for i in range(1, len(prefix)): 443 idx = table[i - 1] 444 while prefix[i] != prefix[idx]: 445 if idx == 0: 446 table[i] = 0 447 break 448 idx = table[idx - 1] 449 else: 450 table[i] = idx + 1 451 return table 452 453def _get_iscased(flags): 454 if not flags & SRE_FLAG_IGNORECASE: 455 return None 456 elif flags & SRE_FLAG_UNICODE: 457 return _sre.unicode_iscased 458 else: 459 return _sre.ascii_iscased 460 461def _get_literal_prefix(pattern, flags): 462 # look for literal prefix 463 prefix = [] 464 prefixappend = prefix.append 465 prefix_skip = None 466 iscased = _get_iscased(flags) 467 for op, av in pattern.data: 468 if op is LITERAL: 469 if iscased and iscased(av): 470 break 471 prefixappend(av) 472 elif op is SUBPATTERN: 473 group, add_flags, del_flags, p = av 474 flags1 = _combine_flags(flags, add_flags, del_flags) 475 if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: 476 break 477 prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) 478 if prefix_skip is None: 479 if group is not None: 480 prefix_skip = len(prefix) 481 elif prefix_skip1 is not None: 482 prefix_skip = len(prefix) + prefix_skip1 483 prefix.extend(prefix1) 484 if not got_all: 485 break 486 else: 487 break 488 else: 489 return prefix, prefix_skip, True 490 return prefix, prefix_skip, False 491 492def _get_charset_prefix(pattern, flags): 493 while True: 494 if not pattern.data: 495 return None 496 op, av = pattern.data[0] 497 if op is not SUBPATTERN: 498 break 499 group, add_flags, del_flags, pattern = av 500 flags = _combine_flags(flags, add_flags, del_flags) 501 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: 502 return None 503 504 iscased = _get_iscased(flags) 505 if op is LITERAL: 506 if iscased and iscased(av): 507 return None 508 return [(op, av)] 509 elif op is BRANCH: 510 charset = [] 511 charsetappend = charset.append 512 for p in av[1]: 513 if not p: 514 return None 515 op, av = p[0] 516 if op is LITERAL and not (iscased and iscased(av)): 517 charsetappend((op, av)) 518 else: 519 return None 520 return charset 521 elif op is IN: 522 charset = av 523 if iscased: 524 for op, av in charset: 525 if op is LITERAL: 526 if iscased(av): 527 return None 528 elif op is RANGE: 529 if av[1] > 0xffff: 530 return None 531 if any(map(iscased, range(av[0], av[1]+1))): 532 return None 533 return charset 534 return None 535 536def _compile_info(code, pattern, flags): 537 # internal: compile an info block. in the current version, 538 # this contains min/max pattern width, and an optional literal 539 # prefix or a character map 540 lo, hi = pattern.getwidth() 541 if hi > MAXCODE: 542 hi = MAXCODE 543 if lo == 0: 544 code.extend([INFO, 4, 0, lo, hi]) 545 return 546 # look for a literal prefix 547 prefix = [] 548 prefix_skip = 0 549 charset = [] # not used 550 if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): 551 # look for literal prefix 552 prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) 553 # if no prefix, look for charset prefix 554 if not prefix: 555 charset = _get_charset_prefix(pattern, flags) 556## if prefix: 557## print("*** PREFIX", prefix, prefix_skip) 558## if charset: 559## print("*** CHARSET", charset) 560 # add an info block 561 emit = code.append 562 emit(INFO) 563 skip = len(code); emit(0) 564 # literal flag 565 mask = 0 566 if prefix: 567 mask = SRE_INFO_PREFIX 568 if prefix_skip is None and got_all: 569 mask = mask | SRE_INFO_LITERAL 570 elif charset: 571 mask = mask | SRE_INFO_CHARSET 572 emit(mask) 573 # pattern length 574 if lo < MAXCODE: 575 emit(lo) 576 else: 577 emit(MAXCODE) 578 prefix = prefix[:MAXCODE] 579 emit(min(hi, MAXCODE)) 580 # add literal prefix 581 if prefix: 582 emit(len(prefix)) # length 583 if prefix_skip is None: 584 prefix_skip = len(prefix) 585 emit(prefix_skip) # skip 586 code.extend(prefix) 587 # generate overlap table 588 code.extend(_generate_overlap_table(prefix)) 589 elif charset: 590 charset, hascased = _optimize_charset(charset) 591 assert not hascased 592 _compile_charset(charset, flags, code) 593 code[skip] = len(code) - skip 594 595def isstring(obj): 596 return isinstance(obj, (str, bytes)) 597 598def _code(p, flags): 599 600 flags = p.state.flags | flags 601 code = [] 602 603 # compile info block 604 _compile_info(code, p, flags) 605 606 # compile the pattern 607 _compile(code, p.data, flags) 608 609 code.append(SUCCESS) 610 611 return code 612 613def _hex_code(code): 614 return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) 615 616def dis(code): 617 import sys 618 619 labels = set() 620 level = 0 621 offset_width = len(str(len(code) - 1)) 622 623 def dis_(start, end): 624 def print_(*args, to=None): 625 if to is not None: 626 labels.add(to) 627 args += ('(to %d)' % (to,),) 628 print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'), 629 end=' '*(level-1)) 630 print(*args) 631 632 def print_2(*args): 633 print(end=' '*(offset_width + 2*level)) 634 print(*args) 635 636 nonlocal level 637 level += 1 638 i = start 639 while i < end: 640 start = i 641 op = code[i] 642 i += 1 643 op = OPCODES[op] 644 if op in (SUCCESS, FAILURE, ANY, ANY_ALL, 645 MAX_UNTIL, MIN_UNTIL, NEGATE): 646 print_(op) 647 elif op in (LITERAL, NOT_LITERAL, 648 LITERAL_IGNORE, NOT_LITERAL_IGNORE, 649 LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE, 650 LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE): 651 arg = code[i] 652 i += 1 653 print_(op, '%#02x (%r)' % (arg, chr(arg))) 654 elif op is AT: 655 arg = code[i] 656 i += 1 657 arg = str(ATCODES[arg]) 658 assert arg[:3] == 'AT_' 659 print_(op, arg[3:]) 660 elif op is CATEGORY: 661 arg = code[i] 662 i += 1 663 arg = str(CHCODES[arg]) 664 assert arg[:9] == 'CATEGORY_' 665 print_(op, arg[9:]) 666 elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE): 667 skip = code[i] 668 print_(op, skip, to=i+skip) 669 dis_(i+1, i+skip) 670 i += skip 671 elif op in (RANGE, RANGE_UNI_IGNORE): 672 lo, hi = code[i: i+2] 673 i += 2 674 print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) 675 elif op is CHARSET: 676 print_(op, _hex_code(code[i: i + 256//_CODEBITS])) 677 i += 256//_CODEBITS 678 elif op is BIGCHARSET: 679 arg = code[i] 680 i += 1 681 mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder) 682 for x in code[i: i + 256//_sre.CODESIZE])) 683 print_(op, arg, mapping) 684 i += 256//_sre.CODESIZE 685 level += 1 686 for j in range(arg): 687 print_2(_hex_code(code[i: i + 256//_CODEBITS])) 688 i += 256//_CODEBITS 689 level -= 1 690 elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, 691 GROUPREF_LOC_IGNORE): 692 arg = code[i] 693 i += 1 694 print_(op, arg) 695 elif op is JUMP: 696 skip = code[i] 697 print_(op, skip, to=i+skip) 698 i += 1 699 elif op is BRANCH: 700 skip = code[i] 701 print_(op, skip, to=i+skip) 702 while skip: 703 dis_(i+1, i+skip) 704 i += skip 705 start = i 706 skip = code[i] 707 if skip: 708 print_('branch', skip, to=i+skip) 709 else: 710 print_(FAILURE) 711 i += 1 712 elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE): 713 skip, min, max = code[i: i+3] 714 if max == MAXREPEAT: 715 max = 'MAXREPEAT' 716 print_(op, skip, min, max, to=i+skip) 717 dis_(i+3, i+skip) 718 i += skip 719 elif op is GROUPREF_EXISTS: 720 arg, skip = code[i: i+2] 721 print_(op, arg, skip, to=i+skip) 722 i += 2 723 elif op in (ASSERT, ASSERT_NOT): 724 skip, arg = code[i: i+2] 725 print_(op, skip, arg, to=i+skip) 726 dis_(i+2, i+skip) 727 i += skip 728 elif op is INFO: 729 skip, flags, min, max = code[i: i+4] 730 if max == MAXREPEAT: 731 max = 'MAXREPEAT' 732 print_(op, skip, bin(flags), min, max, to=i+skip) 733 start = i+4 734 if flags & SRE_INFO_PREFIX: 735 prefix_len, prefix_skip = code[i+4: i+6] 736 print_2(' prefix_skip', prefix_skip) 737 start = i + 6 738 prefix = code[start: start+prefix_len] 739 print_2(' prefix', 740 '[%s]' % ', '.join('%#02x' % x for x in prefix), 741 '(%r)' % ''.join(map(chr, prefix))) 742 start += prefix_len 743 print_2(' overlap', code[start: start+prefix_len]) 744 start += prefix_len 745 if flags & SRE_INFO_CHARSET: 746 level += 1 747 print_2('in') 748 dis_(start, i+skip) 749 level -= 1 750 i += skip 751 else: 752 raise ValueError(op) 753 754 level -= 1 755 756 dis_(0, len(code)) 757 758 759def compile(p, flags=0): 760 # internal: convert pattern list to internal format 761 762 if isstring(p): 763 pattern = p 764 p = sre_parse.parse(p, flags) 765 else: 766 pattern = None 767 768 code = _code(p, flags) 769 770 if flags & SRE_FLAG_DEBUG: 771 print() 772 dis(code) 773 774 # map in either direction 775 groupindex = p.state.groupdict 776 indexgroup = [None] * p.state.groups 777 for k, i in groupindex.items(): 778 indexgroup[i] = k 779 780 return _sre.compile( 781 pattern, flags | p.state.flags, code, 782 p.state.groups-1, 783 groupindex, tuple(indexgroup) 784 ) 785