• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert template to internal format
5#
6# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13import _sre
14import sre_parse
15from sre_constants import *
16
17assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
19_LITERAL_CODES = {LITERAL, NOT_LITERAL}
20_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
21_SUCCESS_CODES = {SUCCESS, FAILURE}
22_ASSERT_CODES = {ASSERT, ASSERT_NOT}
23
24# Sets of lowercase characters which have the same uppercase.
25_equivalences = (
26    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
27    (0x69, 0x131), # iı
28    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
29    (0x73, 0x17f), # sſ
30    # MICRO SIGN, GREEK SMALL LETTER MU
31    (0xb5, 0x3bc), # µμ
32    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
33    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
34    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
35    (0x390, 0x1fd3), # ΐΐ
36    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
37    (0x3b0, 0x1fe3), # ΰΰ
38    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
39    (0x3b2, 0x3d0), # βϐ
40    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
41    (0x3b5, 0x3f5), # εϵ
42    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
43    (0x3b8, 0x3d1), # θϑ
44    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
45    (0x3ba, 0x3f0), # κϰ
46    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
47    (0x3c0, 0x3d6), # πϖ
48    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
49    (0x3c1, 0x3f1), # ρϱ
50    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
51    (0x3c2, 0x3c3), # ςσ
52    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
53    (0x3c6, 0x3d5), # φϕ
54    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
55    (0x1e61, 0x1e9b), # ṡẛ
56    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
57    (0xfb05, 0xfb06), # ſtst
58)
59
60# Maps the lowercase code to lowercase codes which have the same uppercase.
61_ignorecase_fixes = {i: tuple(j for j in t if i != j)
62                     for t in _equivalences for i in t}
63
64def _compile(code, pattern, flags):
65    # internal: compile a (sub)pattern
66    emit = code.append
67    _len = len
68    LITERAL_CODES = _LITERAL_CODES
69    REPEATING_CODES = _REPEATING_CODES
70    SUCCESS_CODES = _SUCCESS_CODES
71    ASSERT_CODES = _ASSERT_CODES
72    if (flags & SRE_FLAG_IGNORECASE and
73            not (flags & SRE_FLAG_LOCALE) and
74            flags & SRE_FLAG_UNICODE and
75            not (flags & SRE_FLAG_ASCII)):
76        fixes = _ignorecase_fixes
77    else:
78        fixes = None
79    for op, av in pattern:
80        if op in LITERAL_CODES:
81            if flags & SRE_FLAG_IGNORECASE:
82                lo = _sre.getlower(av, flags)
83                if fixes and lo in fixes:
84                    emit(IN_IGNORE)
85                    skip = _len(code); emit(0)
86                    if op is NOT_LITERAL:
87                        emit(NEGATE)
88                    for k in (lo,) + fixes[lo]:
89                        emit(LITERAL)
90                        emit(k)
91                    emit(FAILURE)
92                    code[skip] = _len(code) - skip
93                else:
94                    emit(OP_IGNORE[op])
95                    emit(lo)
96            else:
97                emit(op)
98                emit(av)
99        elif op is IN:
100            if flags & SRE_FLAG_IGNORECASE:
101                emit(OP_IGNORE[op])
102                def fixup(literal, flags=flags):
103                    return _sre.getlower(literal, flags)
104            else:
105                emit(op)
106                fixup = None
107            skip = _len(code); emit(0)
108            _compile_charset(av, flags, code, fixup, fixes)
109            code[skip] = _len(code) - skip
110        elif op is ANY:
111            if flags & SRE_FLAG_DOTALL:
112                emit(ANY_ALL)
113            else:
114                emit(ANY)
115        elif op in REPEATING_CODES:
116            if flags & SRE_FLAG_TEMPLATE:
117                raise error("internal: unsupported template operator %r" % (op,))
118            elif _simple(av) and op is not REPEAT:
119                if op is MAX_REPEAT:
120                    emit(REPEAT_ONE)
121                else:
122                    emit(MIN_REPEAT_ONE)
123                skip = _len(code); emit(0)
124                emit(av[0])
125                emit(av[1])
126                _compile(code, av[2], flags)
127                emit(SUCCESS)
128                code[skip] = _len(code) - skip
129            else:
130                emit(REPEAT)
131                skip = _len(code); emit(0)
132                emit(av[0])
133                emit(av[1])
134                _compile(code, av[2], flags)
135                code[skip] = _len(code) - skip
136                if op is MAX_REPEAT:
137                    emit(MAX_UNTIL)
138                else:
139                    emit(MIN_UNTIL)
140        elif op is SUBPATTERN:
141            group, add_flags, del_flags, p = av
142            if group:
143                emit(MARK)
144                emit((group-1)*2)
145            # _compile_info(code, p, (flags | add_flags) & ~del_flags)
146            _compile(code, p, (flags | add_flags) & ~del_flags)
147            if group:
148                emit(MARK)
149                emit((group-1)*2+1)
150        elif op in SUCCESS_CODES:
151            emit(op)
152        elif op in ASSERT_CODES:
153            emit(op)
154            skip = _len(code); emit(0)
155            if av[0] >= 0:
156                emit(0) # look ahead
157            else:
158                lo, hi = av[1].getwidth()
159                if lo != hi:
160                    raise error("look-behind requires fixed-width pattern")
161                emit(lo) # look behind
162            _compile(code, av[1], flags)
163            emit(SUCCESS)
164            code[skip] = _len(code) - skip
165        elif op is CALL:
166            emit(op)
167            skip = _len(code); emit(0)
168            _compile(code, av, flags)
169            emit(SUCCESS)
170            code[skip] = _len(code) - skip
171        elif op is AT:
172            emit(op)
173            if flags & SRE_FLAG_MULTILINE:
174                av = AT_MULTILINE.get(av, av)
175            if flags & SRE_FLAG_LOCALE:
176                av = AT_LOCALE.get(av, av)
177            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
178                av = AT_UNICODE.get(av, av)
179            emit(av)
180        elif op is BRANCH:
181            emit(op)
182            tail = []
183            tailappend = tail.append
184            for av in av[1]:
185                skip = _len(code); emit(0)
186                # _compile_info(code, av, flags)
187                _compile(code, av, flags)
188                emit(JUMP)
189                tailappend(_len(code)); emit(0)
190                code[skip] = _len(code) - skip
191            emit(FAILURE) # end of branch
192            for tail in tail:
193                code[tail] = _len(code) - tail
194        elif op is CATEGORY:
195            emit(op)
196            if flags & SRE_FLAG_LOCALE:
197                av = CH_LOCALE[av]
198            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
199                av = CH_UNICODE[av]
200            emit(av)
201        elif op is GROUPREF:
202            if flags & SRE_FLAG_IGNORECASE:
203                emit(OP_IGNORE[op])
204            else:
205                emit(op)
206            emit(av-1)
207        elif op is GROUPREF_EXISTS:
208            emit(op)
209            emit(av[0]-1)
210            skipyes = _len(code); emit(0)
211            _compile(code, av[1], flags)
212            if av[2]:
213                emit(JUMP)
214                skipno = _len(code); emit(0)
215                code[skipyes] = _len(code) - skipyes + 1
216                _compile(code, av[2], flags)
217                code[skipno] = _len(code) - skipno
218            else:
219                code[skipyes] = _len(code) - skipyes + 1
220        else:
221            raise error("internal: unsupported operand type %r" % (op,))
222
223def _compile_charset(charset, flags, code, fixup=None, fixes=None):
224    # compile charset subprogram
225    emit = code.append
226    for op, av in _optimize_charset(charset, fixup, fixes):
227        emit(op)
228        if op is NEGATE:
229            pass
230        elif op is LITERAL:
231            emit(av)
232        elif op is RANGE or op is RANGE_IGNORE:
233            emit(av[0])
234            emit(av[1])
235        elif op is CHARSET:
236            code.extend(av)
237        elif op is BIGCHARSET:
238            code.extend(av)
239        elif op is CATEGORY:
240            if flags & SRE_FLAG_LOCALE:
241                emit(CH_LOCALE[av])
242            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
243                emit(CH_UNICODE[av])
244            else:
245                emit(av)
246        else:
247            raise error("internal: unsupported set operator %r" % (op,))
248    emit(FAILURE)
249
250def _optimize_charset(charset, fixup, fixes):
251    # internal: optimize character set
252    out = []
253    tail = []
254    charmap = bytearray(256)
255    for op, av in charset:
256        while True:
257            try:
258                if op is LITERAL:
259                    if fixup:
260                        lo = fixup(av)
261                        charmap[lo] = 1
262                        if fixes and lo in fixes:
263                            for k in fixes[lo]:
264                                charmap[k] = 1
265                    else:
266                        charmap[av] = 1
267                elif op is RANGE:
268                    r = range(av[0], av[1]+1)
269                    if fixup:
270                        r = map(fixup, r)
271                    if fixup and fixes:
272                        for i in r:
273                            charmap[i] = 1
274                            if i in fixes:
275                                for k in fixes[i]:
276                                    charmap[k] = 1
277                    else:
278                        for i in r:
279                            charmap[i] = 1
280                elif op is NEGATE:
281                    out.append((op, av))
282                else:
283                    tail.append((op, av))
284            except IndexError:
285                if len(charmap) == 256:
286                    # character set contains non-UCS1 character codes
287                    charmap += b'\0' * 0xff00
288                    continue
289                # Character set contains non-BMP character codes.
290                # There are only two ranges of cased non-BMP characters:
291                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
292                # and for both ranges RANGE_IGNORE works.
293                if fixup and op is RANGE:
294                    op = RANGE_IGNORE
295                tail.append((op, av))
296            break
297
298    # compress character map
299    runs = []
300    q = 0
301    while True:
302        p = charmap.find(1, q)
303        if p < 0:
304            break
305        if len(runs) >= 2:
306            runs = None
307            break
308        q = charmap.find(0, p)
309        if q < 0:
310            runs.append((p, len(charmap)))
311            break
312        runs.append((p, q))
313    if runs is not None:
314        # use literal/range
315        for p, q in runs:
316            if q - p == 1:
317                out.append((LITERAL, p))
318            else:
319                out.append((RANGE, (p, q - 1)))
320        out += tail
321        # if the case was changed or new representation is more compact
322        if fixup or len(out) < len(charset):
323            return out
324        # else original character set is good enough
325        return charset
326
327    # use bitmap
328    if len(charmap) == 256:
329        data = _mk_bitmap(charmap)
330        out.append((CHARSET, data))
331        out += tail
332        return out
333
334    # To represent a big charset, first a bitmap of all characters in the
335    # set is constructed. Then, this bitmap is sliced into chunks of 256
336    # characters, duplicate chunks are eliminated, and each chunk is
337    # given a number. In the compiled expression, the charset is
338    # represented by a 32-bit word sequence, consisting of one word for
339    # the number of different chunks, a sequence of 256 bytes (64 words)
340    # of chunk numbers indexed by their original chunk position, and a
341    # sequence of 256-bit chunks (8 words each).
342
343    # Compression is normally good: in a typical charset, large ranges of
344    # Unicode will be either completely excluded (e.g. if only cyrillic
345    # letters are to be matched), or completely included (e.g. if large
346    # subranges of Kanji match). These ranges will be represented by
347    # chunks of all one-bits or all zero-bits.
348
349    # Matching can be also done efficiently: the more significant byte of
350    # the Unicode character is an index into the chunk number, and the
351    # less significant byte is a bit index in the chunk (just like the
352    # CHARSET matching).
353
354    charmap = bytes(charmap) # should be hashable
355    comps = {}
356    mapping = bytearray(256)
357    block = 0
358    data = bytearray()
359    for i in range(0, 65536, 256):
360        chunk = charmap[i: i + 256]
361        if chunk in comps:
362            mapping[i // 256] = comps[chunk]
363        else:
364            mapping[i // 256] = comps[chunk] = block
365            block += 1
366            data += chunk
367    data = _mk_bitmap(data)
368    data[0:0] = [block] + _bytes_to_codes(mapping)
369    out.append((BIGCHARSET, data))
370    out += tail
371    return out
372
373_CODEBITS = _sre.CODESIZE * 8
374MAXCODE = (1 << _CODEBITS) - 1
375_BITS_TRANS = b'0' + b'1' * 255
376def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
377    s = bits.translate(_BITS_TRANS)[::-1]
378    return [_int(s[i - _CODEBITS: i], 2)
379            for i in range(len(s), 0, -_CODEBITS)]
380
381def _bytes_to_codes(b):
382    # Convert block indices to word array
383    a = memoryview(b).cast('I')
384    assert a.itemsize == _sre.CODESIZE
385    assert len(a) * a.itemsize == len(b)
386    return a.tolist()
387
388def _simple(av):
389    # check if av is a "simple" operator
390    lo, hi = av[2].getwidth()
391    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
392
393def _generate_overlap_table(prefix):
394    """
395    Generate an overlap table for the following prefix.
396    An overlap table is a table of the same size as the prefix which
397    informs about the potential self-overlap for each index in the prefix:
398    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
399    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
400      prefix[0:k]
401    """
402    table = [0] * len(prefix)
403    for i in range(1, len(prefix)):
404        idx = table[i - 1]
405        while prefix[i] != prefix[idx]:
406            if idx == 0:
407                table[i] = 0
408                break
409            idx = table[idx - 1]
410        else:
411            table[i] = idx + 1
412    return table
413
414def _get_literal_prefix(pattern):
415    # look for literal prefix
416    prefix = []
417    prefixappend = prefix.append
418    prefix_skip = None
419    for op, av in pattern.data:
420        if op is LITERAL:
421            prefixappend(av)
422        elif op is SUBPATTERN:
423            group, add_flags, del_flags, p = av
424            if add_flags & SRE_FLAG_IGNORECASE:
425                break
426            prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
427            if prefix_skip is None:
428                if group is not None:
429                    prefix_skip = len(prefix)
430                elif prefix_skip1 is not None:
431                    prefix_skip = len(prefix) + prefix_skip1
432            prefix.extend(prefix1)
433            if not got_all:
434                break
435        else:
436            break
437    else:
438        return prefix, prefix_skip, True
439    return prefix, prefix_skip, False
440
441def _get_charset_prefix(pattern):
442    charset = [] # not used
443    charsetappend = charset.append
444    if pattern.data:
445        op, av = pattern.data[0]
446        if op is SUBPATTERN:
447            group, add_flags, del_flags, p = av
448            if p and not (add_flags & SRE_FLAG_IGNORECASE):
449                op, av = p[0]
450                if op is LITERAL:
451                    charsetappend((op, av))
452                elif op is BRANCH:
453                    c = []
454                    cappend = c.append
455                    for p in av[1]:
456                        if not p:
457                            break
458                        op, av = p[0]
459                        if op is LITERAL:
460                            cappend((op, av))
461                        else:
462                            break
463                    else:
464                        charset = c
465        elif op is BRANCH:
466            c = []
467            cappend = c.append
468            for p in av[1]:
469                if not p:
470                    break
471                op, av = p[0]
472                if op is LITERAL:
473                    cappend((op, av))
474                else:
475                    break
476            else:
477                charset = c
478        elif op is IN:
479            charset = av
480    return charset
481
482def _compile_info(code, pattern, flags):
483    # internal: compile an info block.  in the current version,
484    # this contains min/max pattern width, and an optional literal
485    # prefix or a character map
486    lo, hi = pattern.getwidth()
487    if hi > MAXCODE:
488        hi = MAXCODE
489    if lo == 0:
490        code.extend([INFO, 4, 0, lo, hi])
491        return
492    # look for a literal prefix
493    prefix = []
494    prefix_skip = 0
495    charset = [] # not used
496    if not (flags & SRE_FLAG_IGNORECASE):
497        # look for literal prefix
498        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
499        # if no prefix, look for charset prefix
500        if not prefix:
501            charset = _get_charset_prefix(pattern)
502##     if prefix:
503##         print("*** PREFIX", prefix, prefix_skip)
504##     if charset:
505##         print("*** CHARSET", charset)
506    # add an info block
507    emit = code.append
508    emit(INFO)
509    skip = len(code); emit(0)
510    # literal flag
511    mask = 0
512    if prefix:
513        mask = SRE_INFO_PREFIX
514        if prefix_skip is None and got_all:
515            mask = mask | SRE_INFO_LITERAL
516    elif charset:
517        mask = mask | SRE_INFO_CHARSET
518    emit(mask)
519    # pattern length
520    if lo < MAXCODE:
521        emit(lo)
522    else:
523        emit(MAXCODE)
524        prefix = prefix[:MAXCODE]
525    emit(min(hi, MAXCODE))
526    # add literal prefix
527    if prefix:
528        emit(len(prefix)) # length
529        if prefix_skip is None:
530            prefix_skip =  len(prefix)
531        emit(prefix_skip) # skip
532        code.extend(prefix)
533        # generate overlap table
534        code.extend(_generate_overlap_table(prefix))
535    elif charset:
536        _compile_charset(charset, flags, code)
537    code[skip] = len(code) - skip
538
539def isstring(obj):
540    return isinstance(obj, (str, bytes))
541
542def _code(p, flags):
543
544    flags = p.pattern.flags | flags
545    code = []
546
547    # compile info block
548    _compile_info(code, p, flags)
549
550    # compile the pattern
551    _compile(code, p.data, flags)
552
553    code.append(SUCCESS)
554
555    return code
556
557def compile(p, flags=0):
558    # internal: convert pattern list to internal format
559
560    if isstring(p):
561        pattern = p
562        p = sre_parse.parse(p, flags)
563    else:
564        pattern = None
565
566    code = _code(p, flags)
567
568    # print(code)
569
570    # map in either direction
571    groupindex = p.pattern.groupdict
572    indexgroup = [None] * p.pattern.groups
573    for k, i in groupindex.items():
574        indexgroup[i] = k
575
576    return _sre.compile(
577        pattern, flags | p.pattern.flags, code,
578        p.pattern.groups-1,
579        groupindex, indexgroup
580        )
581