• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert template to internal format
5#
6# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the __init__.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13import _sre
14from . import _parser
15from ._constants import *
16from ._casefix import _EXTRA_CASES
17
18assert _sre.MAGIC == MAGIC, "SRE module mismatch"
19
20_LITERAL_CODES = {LITERAL, NOT_LITERAL}
21_SUCCESS_CODES = {SUCCESS, FAILURE}
22_ASSERT_CODES = {ASSERT, ASSERT_NOT}
23_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
24
25_REPEATING_CODES = {
26    MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
27    MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
28    POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
29}
30
31def _combine_flags(flags, add_flags, del_flags,
32                   TYPE_FLAGS=_parser.TYPE_FLAGS):
33    if add_flags & TYPE_FLAGS:
34        flags &= ~TYPE_FLAGS
35    return (flags | add_flags) & ~del_flags
36
37def _compile(code, pattern, flags):
38    # internal: compile a (sub)pattern
39    emit = code.append
40    _len = len
41    LITERAL_CODES = _LITERAL_CODES
42    REPEATING_CODES = _REPEATING_CODES
43    SUCCESS_CODES = _SUCCESS_CODES
44    ASSERT_CODES = _ASSERT_CODES
45    iscased = None
46    tolower = None
47    fixes = None
48    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
49        if flags & SRE_FLAG_UNICODE:
50            iscased = _sre.unicode_iscased
51            tolower = _sre.unicode_tolower
52            fixes = _EXTRA_CASES
53        else:
54            iscased = _sre.ascii_iscased
55            tolower = _sre.ascii_tolower
56    for op, av in pattern:
57        if op in LITERAL_CODES:
58            if not flags & SRE_FLAG_IGNORECASE:
59                emit(op)
60                emit(av)
61            elif flags & SRE_FLAG_LOCALE:
62                emit(OP_LOCALE_IGNORE[op])
63                emit(av)
64            elif not iscased(av):
65                emit(op)
66                emit(av)
67            else:
68                lo = tolower(av)
69                if not fixes:  # ascii
70                    emit(OP_IGNORE[op])
71                    emit(lo)
72                elif lo not in fixes:
73                    emit(OP_UNICODE_IGNORE[op])
74                    emit(lo)
75                else:
76                    emit(IN_UNI_IGNORE)
77                    skip = _len(code); emit(0)
78                    if op is NOT_LITERAL:
79                        emit(NEGATE)
80                    for k in (lo,) + fixes[lo]:
81                        emit(LITERAL)
82                        emit(k)
83                    emit(FAILURE)
84                    code[skip] = _len(code) - skip
85        elif op is IN:
86            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
87            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
88                emit(IN_LOC_IGNORE)
89            elif not hascased:
90                emit(IN)
91            elif not fixes:  # ascii
92                emit(IN_IGNORE)
93            else:
94                emit(IN_UNI_IGNORE)
95            skip = _len(code); emit(0)
96            _compile_charset(charset, flags, code)
97            code[skip] = _len(code) - skip
98        elif op is ANY:
99            if flags & SRE_FLAG_DOTALL:
100                emit(ANY_ALL)
101            else:
102                emit(ANY)
103        elif op in REPEATING_CODES:
104            if _simple(av[2]):
105                emit(REPEATING_CODES[op][2])
106                skip = _len(code); emit(0)
107                emit(av[0])
108                emit(av[1])
109                _compile(code, av[2], flags)
110                emit(SUCCESS)
111                code[skip] = _len(code) - skip
112            else:
113                emit(REPEATING_CODES[op][0])
114                skip = _len(code); emit(0)
115                emit(av[0])
116                emit(av[1])
117                _compile(code, av[2], flags)
118                code[skip] = _len(code) - skip
119                emit(REPEATING_CODES[op][1])
120        elif op is SUBPATTERN:
121            group, add_flags, del_flags, p = av
122            if group:
123                emit(MARK)
124                emit((group-1)*2)
125            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
126            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
127            if group:
128                emit(MARK)
129                emit((group-1)*2+1)
130        elif op is ATOMIC_GROUP:
131            # Atomic Groups are handled by starting with an Atomic
132            # Group op code, then putting in the atomic group pattern
133            # and finally a success op code to tell any repeat
134            # operations within the Atomic Group to stop eating and
135            # pop their stack if they reach it
136            emit(ATOMIC_GROUP)
137            skip = _len(code); emit(0)
138            _compile(code, av, flags)
139            emit(SUCCESS)
140            code[skip] = _len(code) - skip
141        elif op in SUCCESS_CODES:
142            emit(op)
143        elif op in ASSERT_CODES:
144            emit(op)
145            skip = _len(code); emit(0)
146            if av[0] >= 0:
147                emit(0) # look ahead
148            else:
149                lo, hi = av[1].getwidth()
150                if lo > MAXCODE:
151                    raise error("looks too much behind")
152                if lo != hi:
153                    raise PatternError("look-behind requires fixed-width pattern")
154                emit(lo) # look behind
155            _compile(code, av[1], flags)
156            emit(SUCCESS)
157            code[skip] = _len(code) - skip
158        elif op is AT:
159            emit(op)
160            if flags & SRE_FLAG_MULTILINE:
161                av = AT_MULTILINE.get(av, av)
162            if flags & SRE_FLAG_LOCALE:
163                av = AT_LOCALE.get(av, av)
164            elif flags & SRE_FLAG_UNICODE:
165                av = AT_UNICODE.get(av, av)
166            emit(av)
167        elif op is BRANCH:
168            emit(op)
169            tail = []
170            tailappend = tail.append
171            for av in av[1]:
172                skip = _len(code); emit(0)
173                # _compile_info(code, av, flags)
174                _compile(code, av, flags)
175                emit(JUMP)
176                tailappend(_len(code)); emit(0)
177                code[skip] = _len(code) - skip
178            emit(FAILURE) # end of branch
179            for tail in tail:
180                code[tail] = _len(code) - tail
181        elif op is CATEGORY:
182            emit(op)
183            if flags & SRE_FLAG_LOCALE:
184                av = CH_LOCALE[av]
185            elif flags & SRE_FLAG_UNICODE:
186                av = CH_UNICODE[av]
187            emit(av)
188        elif op is GROUPREF:
189            if not flags & SRE_FLAG_IGNORECASE:
190                emit(op)
191            elif flags & SRE_FLAG_LOCALE:
192                emit(GROUPREF_LOC_IGNORE)
193            elif not fixes:  # ascii
194                emit(GROUPREF_IGNORE)
195            else:
196                emit(GROUPREF_UNI_IGNORE)
197            emit(av-1)
198        elif op is GROUPREF_EXISTS:
199            emit(op)
200            emit(av[0]-1)
201            skipyes = _len(code); emit(0)
202            _compile(code, av[1], flags)
203            if av[2]:
204                emit(JUMP)
205                skipno = _len(code); emit(0)
206                code[skipyes] = _len(code) - skipyes + 1
207                _compile(code, av[2], flags)
208                code[skipno] = _len(code) - skipno
209            else:
210                code[skipyes] = _len(code) - skipyes + 1
211        else:
212            raise PatternError(f"internal: unsupported operand type {op!r}")
213
214def _compile_charset(charset, flags, code):
215    # compile charset subprogram
216    emit = code.append
217    for op, av in charset:
218        emit(op)
219        if op is NEGATE:
220            pass
221        elif op is LITERAL:
222            emit(av)
223        elif op is RANGE or op is RANGE_UNI_IGNORE:
224            emit(av[0])
225            emit(av[1])
226        elif op is CHARSET:
227            code.extend(av)
228        elif op is BIGCHARSET:
229            code.extend(av)
230        elif op is CATEGORY:
231            if flags & SRE_FLAG_LOCALE:
232                emit(CH_LOCALE[av])
233            elif flags & SRE_FLAG_UNICODE:
234                emit(CH_UNICODE[av])
235            else:
236                emit(av)
237        else:
238            raise PatternError(f"internal: unsupported set operator {op!r}")
239    emit(FAILURE)
240
241def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
242    # internal: optimize character set
243    out = []
244    tail = []
245    charmap = bytearray(256)
246    hascased = False
247    for op, av in charset:
248        while True:
249            try:
250                if op is LITERAL:
251                    if fixup: # IGNORECASE and not LOCALE
252                        av = fixup(av)
253                        charmap[av] = 1
254                        if fixes and av in fixes:
255                            for k in fixes[av]:
256                                charmap[k] = 1
257                        if not hascased and iscased(av):
258                            hascased = True
259                    else:
260                        charmap[av] = 1
261                elif op is RANGE:
262                    r = range(av[0], av[1]+1)
263                    if fixup: # IGNORECASE and not LOCALE
264                        if fixes:
265                            for i in map(fixup, r):
266                                charmap[i] = 1
267                                if i in fixes:
268                                    for k in fixes[i]:
269                                        charmap[k] = 1
270                        else:
271                            for i in map(fixup, r):
272                                charmap[i] = 1
273                        if not hascased:
274                            hascased = any(map(iscased, r))
275                    else:
276                        for i in r:
277                            charmap[i] = 1
278                elif op is NEGATE:
279                    out.append((op, av))
280                else:
281                    tail.append((op, av))
282            except IndexError:
283                if len(charmap) == 256:
284                    # character set contains non-UCS1 character codes
285                    charmap += b'\0' * 0xff00
286                    continue
287                # Character set contains non-BMP character codes.
288                # For range, all BMP characters in the range are already
289                # proceeded.
290                if fixup: # IGNORECASE and not LOCALE
291                    # For now, IN_UNI_IGNORE+LITERAL and
292                    # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
293                    # characters, because two characters (at least one of
294                    # which is not in the BMP) match case-insensitively
295                    # if and only if:
296                    # 1) c1.lower() == c2.lower()
297                    # 2) c1.lower() == c2 or c1.lower().upper() == c2
298                    # Also, both c.lower() and c.lower().upper() are single
299                    # characters for every non-BMP character.
300                    if op is RANGE:
301                        if fixes: # not ASCII
302                            op = RANGE_UNI_IGNORE
303                        hascased = True
304                    else:
305                        assert op is LITERAL
306                        if not hascased and iscased(av):
307                            hascased = True
308                tail.append((op, av))
309            break
310
311    # compress character map
312    runs = []
313    q = 0
314    while True:
315        p = charmap.find(1, q)
316        if p < 0:
317            break
318        if len(runs) >= 2:
319            runs = None
320            break
321        q = charmap.find(0, p)
322        if q < 0:
323            runs.append((p, len(charmap)))
324            break
325        runs.append((p, q))
326    if runs is not None:
327        # use literal/range
328        for p, q in runs:
329            if q - p == 1:
330                out.append((LITERAL, p))
331            else:
332                out.append((RANGE, (p, q - 1)))
333        out += tail
334        # if the case was changed or new representation is more compact
335        if hascased or len(out) < len(charset):
336            return out, hascased
337        # else original character set is good enough
338        return charset, hascased
339
340    # use bitmap
341    if len(charmap) == 256:
342        data = _mk_bitmap(charmap)
343        out.append((CHARSET, data))
344        out += tail
345        return out, hascased
346
347    # To represent a big charset, first a bitmap of all characters in the
348    # set is constructed. Then, this bitmap is sliced into chunks of 256
349    # characters, duplicate chunks are eliminated, and each chunk is
350    # given a number. In the compiled expression, the charset is
351    # represented by a 32-bit word sequence, consisting of one word for
352    # the number of different chunks, a sequence of 256 bytes (64 words)
353    # of chunk numbers indexed by their original chunk position, and a
354    # sequence of 256-bit chunks (8 words each).
355
356    # Compression is normally good: in a typical charset, large ranges of
357    # Unicode will be either completely excluded (e.g. if only cyrillic
358    # letters are to be matched), or completely included (e.g. if large
359    # subranges of Kanji match). These ranges will be represented by
360    # chunks of all one-bits or all zero-bits.
361
362    # Matching can be also done efficiently: the more significant byte of
363    # the Unicode character is an index into the chunk number, and the
364    # less significant byte is a bit index in the chunk (just like the
365    # CHARSET matching).
366
367    charmap = bytes(charmap) # should be hashable
368    comps = {}
369    mapping = bytearray(256)
370    block = 0
371    data = bytearray()
372    for i in range(0, 65536, 256):
373        chunk = charmap[i: i + 256]
374        if chunk in comps:
375            mapping[i // 256] = comps[chunk]
376        else:
377            mapping[i // 256] = comps[chunk] = block
378            block += 1
379            data += chunk
380    data = _mk_bitmap(data)
381    data[0:0] = [block] + _bytes_to_codes(mapping)
382    out.append((BIGCHARSET, data))
383    out += tail
384    return out, hascased
385
386_CODEBITS = _sre.CODESIZE * 8
387MAXCODE = (1 << _CODEBITS) - 1
388_BITS_TRANS = b'0' + b'1' * 255
389def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
390    s = bits.translate(_BITS_TRANS)[::-1]
391    return [_int(s[i - _CODEBITS: i], 2)
392            for i in range(len(s), 0, -_CODEBITS)]
393
394def _bytes_to_codes(b):
395    # Convert block indices to word array
396    a = memoryview(b).cast('I')
397    assert a.itemsize == _sre.CODESIZE
398    assert len(a) * a.itemsize == len(b)
399    return a.tolist()
400
401def _simple(p):
402    # check if this subpattern is a "simple" operator
403    if len(p) != 1:
404        return False
405    op, av = p[0]
406    if op is SUBPATTERN:
407        return av[0] is None and _simple(av[-1])
408    return op in _UNIT_CODES
409
410def _generate_overlap_table(prefix):
411    """
412    Generate an overlap table for the following prefix.
413    An overlap table is a table of the same size as the prefix which
414    informs about the potential self-overlap for each index in the prefix:
415    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
416    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
417      prefix[0:k]
418    """
419    table = [0] * len(prefix)
420    for i in range(1, len(prefix)):
421        idx = table[i - 1]
422        while prefix[i] != prefix[idx]:
423            if idx == 0:
424                table[i] = 0
425                break
426            idx = table[idx - 1]
427        else:
428            table[i] = idx + 1
429    return table
430
431def _get_iscased(flags):
432    if not flags & SRE_FLAG_IGNORECASE:
433        return None
434    elif flags & SRE_FLAG_UNICODE:
435        return _sre.unicode_iscased
436    else:
437        return _sre.ascii_iscased
438
439def _get_literal_prefix(pattern, flags):
440    # look for literal prefix
441    prefix = []
442    prefixappend = prefix.append
443    prefix_skip = None
444    iscased = _get_iscased(flags)
445    for op, av in pattern.data:
446        if op is LITERAL:
447            if iscased and iscased(av):
448                break
449            prefixappend(av)
450        elif op is SUBPATTERN:
451            group, add_flags, del_flags, p = av
452            flags1 = _combine_flags(flags, add_flags, del_flags)
453            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
454                break
455            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
456            if prefix_skip is None:
457                if group is not None:
458                    prefix_skip = len(prefix)
459                elif prefix_skip1 is not None:
460                    prefix_skip = len(prefix) + prefix_skip1
461            prefix.extend(prefix1)
462            if not got_all:
463                break
464        else:
465            break
466    else:
467        return prefix, prefix_skip, True
468    return prefix, prefix_skip, False
469
470def _get_charset_prefix(pattern, flags):
471    while True:
472        if not pattern.data:
473            return None
474        op, av = pattern.data[0]
475        if op is not SUBPATTERN:
476            break
477        group, add_flags, del_flags, pattern = av
478        flags = _combine_flags(flags, add_flags, del_flags)
479        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
480            return None
481
482    iscased = _get_iscased(flags)
483    if op is LITERAL:
484        if iscased and iscased(av):
485            return None
486        return [(op, av)]
487    elif op is BRANCH:
488        charset = []
489        charsetappend = charset.append
490        for p in av[1]:
491            if not p:
492                return None
493            op, av = p[0]
494            if op is LITERAL and not (iscased and iscased(av)):
495                charsetappend((op, av))
496            else:
497                return None
498        return charset
499    elif op is IN:
500        charset = av
501        if iscased:
502            for op, av in charset:
503                if op is LITERAL:
504                    if iscased(av):
505                        return None
506                elif op is RANGE:
507                    if av[1] > 0xffff:
508                        return None
509                    if any(map(iscased, range(av[0], av[1]+1))):
510                        return None
511        return charset
512    return None
513
514def _compile_info(code, pattern, flags):
515    # internal: compile an info block.  in the current version,
516    # this contains min/max pattern width, and an optional literal
517    # prefix or a character map
518    lo, hi = pattern.getwidth()
519    if hi > MAXCODE:
520        hi = MAXCODE
521    if lo == 0:
522        code.extend([INFO, 4, 0, lo, hi])
523        return
524    # look for a literal prefix
525    prefix = []
526    prefix_skip = 0
527    charset = [] # not used
528    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
529        # look for literal prefix
530        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
531        # if no prefix, look for charset prefix
532        if not prefix:
533            charset = _get_charset_prefix(pattern, flags)
534##     if prefix:
535##         print("*** PREFIX", prefix, prefix_skip)
536##     if charset:
537##         print("*** CHARSET", charset)
538    # add an info block
539    emit = code.append
540    emit(INFO)
541    skip = len(code); emit(0)
542    # literal flag
543    mask = 0
544    if prefix:
545        mask = SRE_INFO_PREFIX
546        if prefix_skip is None and got_all:
547            mask = mask | SRE_INFO_LITERAL
548    elif charset:
549        mask = mask | SRE_INFO_CHARSET
550    emit(mask)
551    # pattern length
552    if lo < MAXCODE:
553        emit(lo)
554    else:
555        emit(MAXCODE)
556        prefix = prefix[:MAXCODE]
557    emit(hi)
558    # add literal prefix
559    if prefix:
560        emit(len(prefix)) # length
561        if prefix_skip is None:
562            prefix_skip =  len(prefix)
563        emit(prefix_skip) # skip
564        code.extend(prefix)
565        # generate overlap table
566        code.extend(_generate_overlap_table(prefix))
567    elif charset:
568        charset, hascased = _optimize_charset(charset)
569        assert not hascased
570        _compile_charset(charset, flags, code)
571    code[skip] = len(code) - skip
572
573def isstring(obj):
574    return isinstance(obj, (str, bytes))
575
576def _code(p, flags):
577
578    flags = p.state.flags | flags
579    code = []
580
581    # compile info block
582    _compile_info(code, p, flags)
583
584    # compile the pattern
585    _compile(code, p.data, flags)
586
587    code.append(SUCCESS)
588
589    return code
590
591def _hex_code(code):
592    return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
593
594def dis(code):
595    import sys
596
597    labels = set()
598    level = 0
599    offset_width = len(str(len(code) - 1))
600
601    def dis_(start, end):
602        def print_(*args, to=None):
603            if to is not None:
604                labels.add(to)
605                args += ('(to %d)' % (to,),)
606            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
607                  end='  '*(level-1))
608            print(*args)
609
610        def print_2(*args):
611            print(end=' '*(offset_width + 2*level))
612            print(*args)
613
614        nonlocal level
615        level += 1
616        i = start
617        while i < end:
618            start = i
619            op = code[i]
620            i += 1
621            op = OPCODES[op]
622            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
623                      MAX_UNTIL, MIN_UNTIL, NEGATE):
624                print_(op)
625            elif op in (LITERAL, NOT_LITERAL,
626                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
627                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
628                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
629                arg = code[i]
630                i += 1
631                print_(op, '%#02x (%r)' % (arg, chr(arg)))
632            elif op is AT:
633                arg = code[i]
634                i += 1
635                arg = str(ATCODES[arg])
636                assert arg[:3] == 'AT_'
637                print_(op, arg[3:])
638            elif op is CATEGORY:
639                arg = code[i]
640                i += 1
641                arg = str(CHCODES[arg])
642                assert arg[:9] == 'CATEGORY_'
643                print_(op, arg[9:])
644            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
645                skip = code[i]
646                print_(op, skip, to=i+skip)
647                dis_(i+1, i+skip)
648                i += skip
649            elif op in (RANGE, RANGE_UNI_IGNORE):
650                lo, hi = code[i: i+2]
651                i += 2
652                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
653            elif op is CHARSET:
654                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
655                i += 256//_CODEBITS
656            elif op is BIGCHARSET:
657                arg = code[i]
658                i += 1
659                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
660                                        for x in code[i: i + 256//_sre.CODESIZE]))
661                print_(op, arg, mapping)
662                i += 256//_sre.CODESIZE
663                level += 1
664                for j in range(arg):
665                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
666                    i += 256//_CODEBITS
667                level -= 1
668            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
669                        GROUPREF_LOC_IGNORE):
670                arg = code[i]
671                i += 1
672                print_(op, arg)
673            elif op is JUMP:
674                skip = code[i]
675                print_(op, skip, to=i+skip)
676                i += 1
677            elif op is BRANCH:
678                skip = code[i]
679                print_(op, skip, to=i+skip)
680                while skip:
681                    dis_(i+1, i+skip)
682                    i += skip
683                    start = i
684                    skip = code[i]
685                    if skip:
686                        print_('branch', skip, to=i+skip)
687                    else:
688                        print_(FAILURE)
689                i += 1
690            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
691                        POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
692                skip, min, max = code[i: i+3]
693                if max == MAXREPEAT:
694                    max = 'MAXREPEAT'
695                print_(op, skip, min, max, to=i+skip)
696                dis_(i+3, i+skip)
697                i += skip
698            elif op is GROUPREF_EXISTS:
699                arg, skip = code[i: i+2]
700                print_(op, arg, skip, to=i+skip)
701                i += 2
702            elif op in (ASSERT, ASSERT_NOT):
703                skip, arg = code[i: i+2]
704                print_(op, skip, arg, to=i+skip)
705                dis_(i+2, i+skip)
706                i += skip
707            elif op is ATOMIC_GROUP:
708                skip = code[i]
709                print_(op, skip, to=i+skip)
710                dis_(i+1, i+skip)
711                i += skip
712            elif op is INFO:
713                skip, flags, min, max = code[i: i+4]
714                if max == MAXREPEAT:
715                    max = 'MAXREPEAT'
716                print_(op, skip, bin(flags), min, max, to=i+skip)
717                start = i+4
718                if flags & SRE_INFO_PREFIX:
719                    prefix_len, prefix_skip = code[i+4: i+6]
720                    print_2('  prefix_skip', prefix_skip)
721                    start = i + 6
722                    prefix = code[start: start+prefix_len]
723                    print_2('  prefix',
724                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
725                            '(%r)' % ''.join(map(chr, prefix)))
726                    start += prefix_len
727                    print_2('  overlap', code[start: start+prefix_len])
728                    start += prefix_len
729                if flags & SRE_INFO_CHARSET:
730                    level += 1
731                    print_2('in')
732                    dis_(start, i+skip)
733                    level -= 1
734                i += skip
735            else:
736                raise ValueError(op)
737
738        level -= 1
739
740    dis_(0, len(code))
741
742
743def compile(p, flags=0):
744    # internal: convert pattern list to internal format
745
746    if isstring(p):
747        pattern = p
748        p = _parser.parse(p, flags)
749    else:
750        pattern = None
751
752    code = _code(p, flags)
753
754    if flags & SRE_FLAG_DEBUG:
755        print()
756        dis(code)
757
758    # map in either direction
759    groupindex = p.state.groupdict
760    indexgroup = [None] * p.state.groups
761    for k, i in groupindex.items():
762        indexgroup[i] = k
763
764    return _sre.compile(
765        pattern, flags | p.state.flags, code,
766        p.state.groups-1,
767        groupindex, tuple(indexgroup)
768        )
769