• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert re-style regular expression to sre pattern
5#
6# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the __init__.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13# XXX: show string offset and offending character for all errors
14
15from ._constants import *
16
17SPECIAL_CHARS = ".\\[{()*+?^$|"
18REPEAT_CHARS = "*+?{"
19
20DIGITS = frozenset("0123456789")
21
22OCTDIGITS = frozenset("01234567")
23HEXDIGITS = frozenset("0123456789abcdefABCDEF")
24ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
25
26WHITESPACE = frozenset(" \t\n\r\v\f")
27
28_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
29_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
30
31ESCAPES = {
32    r"\a": (LITERAL, ord("\a")),
33    r"\b": (LITERAL, ord("\b")),
34    r"\f": (LITERAL, ord("\f")),
35    r"\n": (LITERAL, ord("\n")),
36    r"\r": (LITERAL, ord("\r")),
37    r"\t": (LITERAL, ord("\t")),
38    r"\v": (LITERAL, ord("\v")),
39    r"\\": (LITERAL, ord("\\"))
40}
41
42CATEGORIES = {
43    r"\A": (AT, AT_BEGINNING_STRING), # start of string
44    r"\b": (AT, AT_BOUNDARY),
45    r"\B": (AT, AT_NON_BOUNDARY),
46    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
47    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
48    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
49    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
50    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
51    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
52    r"\Z": (AT, AT_END_STRING), # end of string
53}
54
55FLAGS = {
56    # standard flags
57    "i": SRE_FLAG_IGNORECASE,
58    "L": SRE_FLAG_LOCALE,
59    "m": SRE_FLAG_MULTILINE,
60    "s": SRE_FLAG_DOTALL,
61    "x": SRE_FLAG_VERBOSE,
62    # extensions
63    "a": SRE_FLAG_ASCII,
64    "u": SRE_FLAG_UNICODE,
65}
66
67TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
68GLOBAL_FLAGS = SRE_FLAG_DEBUG
69
70# Maximal value returned by SubPattern.getwidth().
71# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
72MAXWIDTH = 1 << 64
73
74class State:
75    # keeps track of state for parsing
76    def __init__(self):
77        self.flags = 0
78        self.groupdict = {}
79        self.groupwidths = [None]  # group 0
80        self.lookbehindgroups = None
81        self.grouprefpos = {}
82    @property
83    def groups(self):
84        return len(self.groupwidths)
85    def opengroup(self, name=None):
86        gid = self.groups
87        self.groupwidths.append(None)
88        if self.groups > MAXGROUPS:
89            raise error("too many groups")
90        if name is not None:
91            ogid = self.groupdict.get(name, None)
92            if ogid is not None:
93                raise error("redefinition of group name %r as group %d; "
94                            "was group %d" % (name, gid,  ogid))
95            self.groupdict[name] = gid
96        return gid
97    def closegroup(self, gid, p):
98        self.groupwidths[gid] = p.getwidth()
99    def checkgroup(self, gid):
100        return gid < self.groups and self.groupwidths[gid] is not None
101
102    def checklookbehindgroup(self, gid, source):
103        if self.lookbehindgroups is not None:
104            if not self.checkgroup(gid):
105                raise source.error('cannot refer to an open group')
106            if gid >= self.lookbehindgroups:
107                raise source.error('cannot refer to group defined in the same '
108                                   'lookbehind subpattern')
109
110class SubPattern:
111    # a subpattern, in intermediate form
112    def __init__(self, state, data=None):
113        self.state = state
114        if data is None:
115            data = []
116        self.data = data
117        self.width = None
118
119    def dump(self, level=0):
120        seqtypes = (tuple, list)
121        for op, av in self.data:
122            print(level*"  " + str(op), end='')
123            if op is IN:
124                # member sublanguage
125                print()
126                for op, a in av:
127                    print((level+1)*"  " + str(op), a)
128            elif op is BRANCH:
129                print()
130                for i, a in enumerate(av[1]):
131                    if i:
132                        print(level*"  " + "OR")
133                    a.dump(level+1)
134            elif op is GROUPREF_EXISTS:
135                condgroup, item_yes, item_no = av
136                print('', condgroup)
137                item_yes.dump(level+1)
138                if item_no:
139                    print(level*"  " + "ELSE")
140                    item_no.dump(level+1)
141            elif isinstance(av, SubPattern):
142                print()
143                av.dump(level+1)
144            elif isinstance(av, seqtypes):
145                nl = False
146                for a in av:
147                    if isinstance(a, SubPattern):
148                        if not nl:
149                            print()
150                        a.dump(level+1)
151                        nl = True
152                    else:
153                        if not nl:
154                            print(' ', end='')
155                        print(a, end='')
156                        nl = False
157                if not nl:
158                    print()
159            else:
160                print('', av)
161    def __repr__(self):
162        return repr(self.data)
163    def __len__(self):
164        return len(self.data)
165    def __delitem__(self, index):
166        del self.data[index]
167    def __getitem__(self, index):
168        if isinstance(index, slice):
169            return SubPattern(self.state, self.data[index])
170        return self.data[index]
171    def __setitem__(self, index, code):
172        self.data[index] = code
173    def insert(self, index, code):
174        self.data.insert(index, code)
175    def append(self, code):
176        self.data.append(code)
177    def getwidth(self):
178        # determine the width (min, max) for this subpattern
179        if self.width is not None:
180            return self.width
181        lo = hi = 0
182        for op, av in self.data:
183            if op is BRANCH:
184                i = MAXWIDTH
185                j = 0
186                for av in av[1]:
187                    l, h = av.getwidth()
188                    i = min(i, l)
189                    j = max(j, h)
190                lo = lo + i
191                hi = hi + j
192            elif op is ATOMIC_GROUP:
193                i, j = av.getwidth()
194                lo = lo + i
195                hi = hi + j
196            elif op is SUBPATTERN:
197                i, j = av[-1].getwidth()
198                lo = lo + i
199                hi = hi + j
200            elif op in _REPEATCODES:
201                i, j = av[2].getwidth()
202                lo = lo + i * av[0]
203                if av[1] == MAXREPEAT and j:
204                    hi = MAXWIDTH
205                else:
206                    hi = hi + j * av[1]
207            elif op in _UNITCODES:
208                lo = lo + 1
209                hi = hi + 1
210            elif op is GROUPREF:
211                i, j = self.state.groupwidths[av]
212                lo = lo + i
213                hi = hi + j
214            elif op is GROUPREF_EXISTS:
215                i, j = av[1].getwidth()
216                if av[2] is not None:
217                    l, h = av[2].getwidth()
218                    i = min(i, l)
219                    j = max(j, h)
220                else:
221                    i = 0
222                lo = lo + i
223                hi = hi + j
224            elif op is SUCCESS:
225                break
226        self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
227        return self.width
228
229class Tokenizer:
230    def __init__(self, string):
231        self.istext = isinstance(string, str)
232        self.string = string
233        if not self.istext:
234            string = str(string, 'latin1')
235        self.decoded_string = string
236        self.index = 0
237        self.next = None
238        self.__next()
239    def __next(self):
240        index = self.index
241        try:
242            char = self.decoded_string[index]
243        except IndexError:
244            self.next = None
245            return
246        if char == "\\":
247            index += 1
248            try:
249                char += self.decoded_string[index]
250            except IndexError:
251                raise error("bad escape (end of pattern)",
252                            self.string, len(self.string) - 1) from None
253        self.index = index + 1
254        self.next = char
255    def match(self, char):
256        if char == self.next:
257            self.__next()
258            return True
259        return False
260    def get(self):
261        this = self.next
262        self.__next()
263        return this
264    def getwhile(self, n, charset):
265        result = ''
266        for _ in range(n):
267            c = self.next
268            if c not in charset:
269                break
270            result += c
271            self.__next()
272        return result
273    def getuntil(self, terminator, name):
274        result = ''
275        while True:
276            c = self.next
277            self.__next()
278            if c is None:
279                if not result:
280                    raise self.error("missing " + name)
281                raise self.error("missing %s, unterminated name" % terminator,
282                                 len(result))
283            if c == terminator:
284                if not result:
285                    raise self.error("missing " + name, 1)
286                break
287            result += c
288        return result
289    @property
290    def pos(self):
291        return self.index - len(self.next or '')
292    def tell(self):
293        return self.index - len(self.next or '')
294    def seek(self, index):
295        self.index = index
296        self.__next()
297
298    def error(self, msg, offset=0):
299        if not self.istext:
300            msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
301        return error(msg, self.string, self.tell() - offset)
302
303    def checkgroupname(self, name, offset):
304        if not (self.istext or name.isascii()):
305            msg = "bad character in group name %a" % name
306            raise self.error(msg, len(name) + offset)
307        if not name.isidentifier():
308            msg = "bad character in group name %r" % name
309            raise self.error(msg, len(name) + offset)
310
311def _class_escape(source, escape):
312    # handle escape code inside character class
313    code = ESCAPES.get(escape)
314    if code:
315        return code
316    code = CATEGORIES.get(escape)
317    if code and code[0] is IN:
318        return code
319    try:
320        c = escape[1:2]
321        if c == "x":
322            # hexadecimal escape (exactly two digits)
323            escape += source.getwhile(2, HEXDIGITS)
324            if len(escape) != 4:
325                raise source.error("incomplete escape %s" % escape, len(escape))
326            return LITERAL, int(escape[2:], 16)
327        elif c == "u" and source.istext:
328            # unicode escape (exactly four digits)
329            escape += source.getwhile(4, HEXDIGITS)
330            if len(escape) != 6:
331                raise source.error("incomplete escape %s" % escape, len(escape))
332            return LITERAL, int(escape[2:], 16)
333        elif c == "U" and source.istext:
334            # unicode escape (exactly eight digits)
335            escape += source.getwhile(8, HEXDIGITS)
336            if len(escape) != 10:
337                raise source.error("incomplete escape %s" % escape, len(escape))
338            c = int(escape[2:], 16)
339            chr(c) # raise ValueError for invalid code
340            return LITERAL, c
341        elif c == "N" and source.istext:
342            import unicodedata
343            # named unicode escape e.g. \N{EM DASH}
344            if not source.match('{'):
345                raise source.error("missing {")
346            charname = source.getuntil('}', 'character name')
347            try:
348                c = ord(unicodedata.lookup(charname))
349            except (KeyError, TypeError):
350                raise source.error("undefined character name %r" % charname,
351                                   len(charname) + len(r'\N{}')) from None
352            return LITERAL, c
353        elif c in OCTDIGITS:
354            # octal escape (up to three digits)
355            escape += source.getwhile(2, OCTDIGITS)
356            c = int(escape[1:], 8)
357            if c > 0o377:
358                raise source.error('octal escape value %s outside of '
359                                   'range 0-0o377' % escape, len(escape))
360            return LITERAL, c
361        elif c in DIGITS:
362            raise ValueError
363        if len(escape) == 2:
364            if c in ASCIILETTERS:
365                raise source.error('bad escape %s' % escape, len(escape))
366            return LITERAL, ord(escape[1])
367    except ValueError:
368        pass
369    raise source.error("bad escape %s" % escape, len(escape))
370
371def _escape(source, escape, state):
372    # handle escape code in expression
373    code = CATEGORIES.get(escape)
374    if code:
375        return code
376    code = ESCAPES.get(escape)
377    if code:
378        return code
379    try:
380        c = escape[1:2]
381        if c == "x":
382            # hexadecimal escape
383            escape += source.getwhile(2, HEXDIGITS)
384            if len(escape) != 4:
385                raise source.error("incomplete escape %s" % escape, len(escape))
386            return LITERAL, int(escape[2:], 16)
387        elif c == "u" and source.istext:
388            # unicode escape (exactly four digits)
389            escape += source.getwhile(4, HEXDIGITS)
390            if len(escape) != 6:
391                raise source.error("incomplete escape %s" % escape, len(escape))
392            return LITERAL, int(escape[2:], 16)
393        elif c == "U" and source.istext:
394            # unicode escape (exactly eight digits)
395            escape += source.getwhile(8, HEXDIGITS)
396            if len(escape) != 10:
397                raise source.error("incomplete escape %s" % escape, len(escape))
398            c = int(escape[2:], 16)
399            chr(c) # raise ValueError for invalid code
400            return LITERAL, c
401        elif c == "N" and source.istext:
402            import unicodedata
403            # named unicode escape e.g. \N{EM DASH}
404            if not source.match('{'):
405                raise source.error("missing {")
406            charname = source.getuntil('}', 'character name')
407            try:
408                c = ord(unicodedata.lookup(charname))
409            except (KeyError, TypeError):
410                raise source.error("undefined character name %r" % charname,
411                                   len(charname) + len(r'\N{}')) from None
412            return LITERAL, c
413        elif c == "0":
414            # octal escape
415            escape += source.getwhile(2, OCTDIGITS)
416            return LITERAL, int(escape[1:], 8)
417        elif c in DIGITS:
418            # octal escape *or* decimal group reference (sigh)
419            if source.next in DIGITS:
420                escape += source.get()
421                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
422                    source.next in OCTDIGITS):
423                    # got three octal digits; this is an octal escape
424                    escape += source.get()
425                    c = int(escape[1:], 8)
426                    if c > 0o377:
427                        raise source.error('octal escape value %s outside of '
428                                           'range 0-0o377' % escape,
429                                           len(escape))
430                    return LITERAL, c
431            # not an octal escape, so this is a group reference
432            group = int(escape[1:])
433            if group < state.groups:
434                if not state.checkgroup(group):
435                    raise source.error("cannot refer to an open group",
436                                       len(escape))
437                state.checklookbehindgroup(group, source)
438                return GROUPREF, group
439            raise source.error("invalid group reference %d" % group, len(escape) - 1)
440        if len(escape) == 2:
441            if c in ASCIILETTERS:
442                raise source.error("bad escape %s" % escape, len(escape))
443            return LITERAL, ord(escape[1])
444    except ValueError:
445        pass
446    raise source.error("bad escape %s" % escape, len(escape))
447
448def _uniq(items):
449    return list(dict.fromkeys(items))
450
451def _parse_sub(source, state, verbose, nested):
452    # parse an alternation: a|b|c
453
454    items = []
455    itemsappend = items.append
456    sourcematch = source.match
457    start = source.tell()
458    while True:
459        itemsappend(_parse(source, state, verbose, nested + 1,
460                           not nested and not items))
461        if not sourcematch("|"):
462            break
463        if not nested:
464            verbose = state.flags & SRE_FLAG_VERBOSE
465
466    if len(items) == 1:
467        return items[0]
468
469    subpattern = SubPattern(state)
470
471    # check if all items share a common prefix
472    while True:
473        prefix = None
474        for item in items:
475            if not item:
476                break
477            if prefix is None:
478                prefix = item[0]
479            elif item[0] != prefix:
480                break
481        else:
482            # all subitems start with a common "prefix".
483            # move it out of the branch
484            for item in items:
485                del item[0]
486            subpattern.append(prefix)
487            continue # check next one
488        break
489
490    # check if the branch can be replaced by a character set
491    set = []
492    for item in items:
493        if len(item) != 1:
494            break
495        op, av = item[0]
496        if op is LITERAL:
497            set.append((op, av))
498        elif op is IN and av[0][0] is not NEGATE:
499            set.extend(av)
500        else:
501            break
502    else:
503        # we can store this as a character set instead of a
504        # branch (the compiler may optimize this even more)
505        subpattern.append((IN, _uniq(set)))
506        return subpattern
507
508    subpattern.append((BRANCH, (None, items)))
509    return subpattern
510
511def _parse(source, state, verbose, nested, first=False):
512    # parse a simple pattern
513    subpattern = SubPattern(state)
514
515    # precompute constants into local variables
516    subpatternappend = subpattern.append
517    sourceget = source.get
518    sourcematch = source.match
519    _len = len
520    _ord = ord
521
522    while True:
523
524        this = source.next
525        if this is None:
526            break # end of pattern
527        if this in "|)":
528            break # end of subpattern
529        sourceget()
530
531        if verbose:
532            # skip whitespace and comments
533            if this in WHITESPACE:
534                continue
535            if this == "#":
536                while True:
537                    this = sourceget()
538                    if this is None or this == "\n":
539                        break
540                continue
541
542        if this[0] == "\\":
543            code = _escape(source, this, state)
544            subpatternappend(code)
545
546        elif this not in SPECIAL_CHARS:
547            subpatternappend((LITERAL, _ord(this)))
548
549        elif this == "[":
550            here = source.tell() - 1
551            # character set
552            set = []
553            setappend = set.append
554##          if sourcematch(":"):
555##              pass # handle character classes
556            if source.next == '[':
557                import warnings
558                warnings.warn(
559                    'Possible nested set at position %d' % source.tell(),
560                    FutureWarning, stacklevel=nested + 6
561                )
562            negate = sourcematch("^")
563            # check remaining characters
564            while True:
565                this = sourceget()
566                if this is None:
567                    raise source.error("unterminated character set",
568                                       source.tell() - here)
569                if this == "]" and set:
570                    break
571                elif this[0] == "\\":
572                    code1 = _class_escape(source, this)
573                else:
574                    if set and this in '-&~|' and source.next == this:
575                        import warnings
576                        warnings.warn(
577                            'Possible set %s at position %d' % (
578                                'difference' if this == '-' else
579                                'intersection' if this == '&' else
580                                'symmetric difference' if this == '~' else
581                                'union',
582                                source.tell() - 1),
583                            FutureWarning, stacklevel=nested + 6
584                        )
585                    code1 = LITERAL, _ord(this)
586                if sourcematch("-"):
587                    # potential range
588                    that = sourceget()
589                    if that is None:
590                        raise source.error("unterminated character set",
591                                           source.tell() - here)
592                    if that == "]":
593                        if code1[0] is IN:
594                            code1 = code1[1][0]
595                        setappend(code1)
596                        setappend((LITERAL, _ord("-")))
597                        break
598                    if that[0] == "\\":
599                        code2 = _class_escape(source, that)
600                    else:
601                        if that == '-':
602                            import warnings
603                            warnings.warn(
604                                'Possible set difference at position %d' % (
605                                    source.tell() - 2),
606                                FutureWarning, stacklevel=nested + 6
607                            )
608                        code2 = LITERAL, _ord(that)
609                    if code1[0] != LITERAL or code2[0] != LITERAL:
610                        msg = "bad character range %s-%s" % (this, that)
611                        raise source.error(msg, len(this) + 1 + len(that))
612                    lo = code1[1]
613                    hi = code2[1]
614                    if hi < lo:
615                        msg = "bad character range %s-%s" % (this, that)
616                        raise source.error(msg, len(this) + 1 + len(that))
617                    setappend((RANGE, (lo, hi)))
618                else:
619                    if code1[0] is IN:
620                        code1 = code1[1][0]
621                    setappend(code1)
622
623            set = _uniq(set)
624            # XXX: <fl> should move set optimization to compiler!
625            if _len(set) == 1 and set[0][0] is LITERAL:
626                # optimization
627                if negate:
628                    subpatternappend((NOT_LITERAL, set[0][1]))
629                else:
630                    subpatternappend(set[0])
631            else:
632                if negate:
633                    set.insert(0, (NEGATE, None))
634                # charmap optimization can't be added here because
635                # global flags still are not known
636                subpatternappend((IN, set))
637
638        elif this in REPEAT_CHARS:
639            # repeat previous item
640            here = source.tell()
641            if this == "?":
642                min, max = 0, 1
643            elif this == "*":
644                min, max = 0, MAXREPEAT
645
646            elif this == "+":
647                min, max = 1, MAXREPEAT
648            elif this == "{":
649                if source.next == "}":
650                    subpatternappend((LITERAL, _ord(this)))
651                    continue
652
653                min, max = 0, MAXREPEAT
654                lo = hi = ""
655                while source.next in DIGITS:
656                    lo += sourceget()
657                if sourcematch(","):
658                    while source.next in DIGITS:
659                        hi += sourceget()
660                else:
661                    hi = lo
662                if not sourcematch("}"):
663                    subpatternappend((LITERAL, _ord(this)))
664                    source.seek(here)
665                    continue
666
667                if lo:
668                    min = int(lo)
669                    if min >= MAXREPEAT:
670                        raise OverflowError("the repetition number is too large")
671                if hi:
672                    max = int(hi)
673                    if max >= MAXREPEAT:
674                        raise OverflowError("the repetition number is too large")
675                    if max < min:
676                        raise source.error("min repeat greater than max repeat",
677                                           source.tell() - here)
678            else:
679                raise AssertionError("unsupported quantifier %r" % (char,))
680            # figure out which item to repeat
681            if subpattern:
682                item = subpattern[-1:]
683            else:
684                item = None
685            if not item or item[0][0] is AT:
686                raise source.error("nothing to repeat",
687                                   source.tell() - here + len(this))
688            if item[0][0] in _REPEATCODES:
689                raise source.error("multiple repeat",
690                                   source.tell() - here + len(this))
691            if item[0][0] is SUBPATTERN:
692                group, add_flags, del_flags, p = item[0][1]
693                if group is None and not add_flags and not del_flags:
694                    item = p
695            if sourcematch("?"):
696                # Non-Greedy Match
697                subpattern[-1] = (MIN_REPEAT, (min, max, item))
698            elif sourcematch("+"):
699                # Possessive Match (Always Greedy)
700                subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
701            else:
702                # Greedy Match
703                subpattern[-1] = (MAX_REPEAT, (min, max, item))
704
705        elif this == ".":
706            subpatternappend((ANY, None))
707
708        elif this == "(":
709            start = source.tell() - 1
710            capture = True
711            atomic = False
712            name = None
713            add_flags = 0
714            del_flags = 0
715            if sourcematch("?"):
716                # options
717                char = sourceget()
718                if char is None:
719                    raise source.error("unexpected end of pattern")
720                if char == "P":
721                    # python extensions
722                    if sourcematch("<"):
723                        # named group: skip forward to end of name
724                        name = source.getuntil(">", "group name")
725                        source.checkgroupname(name, 1)
726                    elif sourcematch("="):
727                        # named backreference
728                        name = source.getuntil(")", "group name")
729                        source.checkgroupname(name, 1)
730                        gid = state.groupdict.get(name)
731                        if gid is None:
732                            msg = "unknown group name %r" % name
733                            raise source.error(msg, len(name) + 1)
734                        if not state.checkgroup(gid):
735                            raise source.error("cannot refer to an open group",
736                                               len(name) + 1)
737                        state.checklookbehindgroup(gid, source)
738                        subpatternappend((GROUPREF, gid))
739                        continue
740
741                    else:
742                        char = sourceget()
743                        if char is None:
744                            raise source.error("unexpected end of pattern")
745                        raise source.error("unknown extension ?P" + char,
746                                           len(char) + 2)
747                elif char == ":":
748                    # non-capturing group
749                    capture = False
750                elif char == "#":
751                    # comment
752                    while True:
753                        if source.next is None:
754                            raise source.error("missing ), unterminated comment",
755                                               source.tell() - start)
756                        if sourceget() == ")":
757                            break
758                    continue
759
760                elif char in "=!<":
761                    # lookahead assertions
762                    dir = 1
763                    if char == "<":
764                        char = sourceget()
765                        if char is None:
766                            raise source.error("unexpected end of pattern")
767                        if char not in "=!":
768                            raise source.error("unknown extension ?<" + char,
769                                               len(char) + 2)
770                        dir = -1 # lookbehind
771                        lookbehindgroups = state.lookbehindgroups
772                        if lookbehindgroups is None:
773                            state.lookbehindgroups = state.groups
774                    p = _parse_sub(source, state, verbose, nested + 1)
775                    if dir < 0:
776                        if lookbehindgroups is None:
777                            state.lookbehindgroups = None
778                    if not sourcematch(")"):
779                        raise source.error("missing ), unterminated subpattern",
780                                           source.tell() - start)
781                    if char == "=":
782                        subpatternappend((ASSERT, (dir, p)))
783                    elif p:
784                        subpatternappend((ASSERT_NOT, (dir, p)))
785                    else:
786                        subpatternappend((FAILURE, ()))
787                    continue
788
789                elif char == "(":
790                    # conditional backreference group
791                    condname = source.getuntil(")", "group name")
792                    if not (condname.isdecimal() and condname.isascii()):
793                        source.checkgroupname(condname, 1)
794                        condgroup = state.groupdict.get(condname)
795                        if condgroup is None:
796                            msg = "unknown group name %r" % condname
797                            raise source.error(msg, len(condname) + 1)
798                    else:
799                        condgroup = int(condname)
800                        if not condgroup:
801                            raise source.error("bad group number",
802                                               len(condname) + 1)
803                        if condgroup >= MAXGROUPS:
804                            msg = "invalid group reference %d" % condgroup
805                            raise source.error(msg, len(condname) + 1)
806                        if condgroup not in state.grouprefpos:
807                            state.grouprefpos[condgroup] = (
808                                source.tell() - len(condname) - 1
809                            )
810                        if not (condname.isdecimal() and condname.isascii()):
811                            import warnings
812                            warnings.warn(
813                                "bad character in group name %s at position %d" %
814                                (repr(condname) if source.istext else ascii(condname),
815                                 source.tell() - len(condname) - 1),
816                                DeprecationWarning, stacklevel=nested + 6
817                            )
818                    state.checklookbehindgroup(condgroup, source)
819                    item_yes = _parse(source, state, verbose, nested + 1)
820                    if source.match("|"):
821                        item_no = _parse(source, state, verbose, nested + 1)
822                        if source.next == "|":
823                            raise source.error("conditional backref with more than two branches")
824                    else:
825                        item_no = None
826                    if not source.match(")"):
827                        raise source.error("missing ), unterminated subpattern",
828                                           source.tell() - start)
829                    subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
830                    continue
831
832                elif char == ">":
833                    # non-capturing, atomic group
834                    capture = False
835                    atomic = True
836                elif char in FLAGS or char == "-":
837                    # flags
838                    flags = _parse_flags(source, state, char)
839                    if flags is None:  # global flags
840                        if not first or subpattern:
841                            raise source.error('global flags not at the start '
842                                               'of the expression',
843                                               source.tell() - start)
844                        verbose = state.flags & SRE_FLAG_VERBOSE
845                        continue
846
847                    add_flags, del_flags = flags
848                    capture = False
849                else:
850                    raise source.error("unknown extension ?" + char,
851                                       len(char) + 1)
852
853            # parse group contents
854            if capture:
855                try:
856                    group = state.opengroup(name)
857                except error as err:
858                    raise source.error(err.msg, len(name) + 1) from None
859            else:
860                group = None
861            sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
862                           not (del_flags & SRE_FLAG_VERBOSE))
863            p = _parse_sub(source, state, sub_verbose, nested + 1)
864            if not source.match(")"):
865                raise source.error("missing ), unterminated subpattern",
866                                   source.tell() - start)
867            if group is not None:
868                state.closegroup(group, p)
869            if atomic:
870                assert group is None
871                subpatternappend((ATOMIC_GROUP, p))
872            else:
873                subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
874
875        elif this == "^":
876            subpatternappend((AT, AT_BEGINNING))
877
878        elif this == "$":
879            subpatternappend((AT, AT_END))
880
881        else:
882            raise AssertionError("unsupported special character %r" % (char,))
883
884    # unpack non-capturing groups
885    for i in range(len(subpattern))[::-1]:
886        op, av = subpattern[i]
887        if op is SUBPATTERN:
888            group, add_flags, del_flags, p = av
889            if group is None and not add_flags and not del_flags:
890                subpattern[i: i+1] = p
891
892    return subpattern
893
894def _parse_flags(source, state, char):
895    sourceget = source.get
896    add_flags = 0
897    del_flags = 0
898    if char != "-":
899        while True:
900            flag = FLAGS[char]
901            if source.istext:
902                if char == 'L':
903                    msg = "bad inline flags: cannot use 'L' flag with a str pattern"
904                    raise source.error(msg)
905            else:
906                if char == 'u':
907                    msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
908                    raise source.error(msg)
909            add_flags |= flag
910            if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
911                msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
912                raise source.error(msg)
913            char = sourceget()
914            if char is None:
915                raise source.error("missing -, : or )")
916            if char in ")-:":
917                break
918            if char not in FLAGS:
919                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
920                raise source.error(msg, len(char))
921    if char == ")":
922        state.flags |= add_flags
923        return None
924    if add_flags & GLOBAL_FLAGS:
925        raise source.error("bad inline flags: cannot turn on global flag", 1)
926    if char == "-":
927        char = sourceget()
928        if char is None:
929            raise source.error("missing flag")
930        if char not in FLAGS:
931            msg = "unknown flag" if char.isalpha() else "missing flag"
932            raise source.error(msg, len(char))
933        while True:
934            flag = FLAGS[char]
935            if flag & TYPE_FLAGS:
936                msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
937                raise source.error(msg)
938            del_flags |= flag
939            char = sourceget()
940            if char is None:
941                raise source.error("missing :")
942            if char == ":":
943                break
944            if char not in FLAGS:
945                msg = "unknown flag" if char.isalpha() else "missing :"
946                raise source.error(msg, len(char))
947    assert char == ":"
948    if del_flags & GLOBAL_FLAGS:
949        raise source.error("bad inline flags: cannot turn off global flag", 1)
950    if add_flags & del_flags:
951        raise source.error("bad inline flags: flag turned on and off", 1)
952    return add_flags, del_flags
953
954def fix_flags(src, flags):
955    # Check and fix flags according to the type of pattern (str or bytes)
956    if isinstance(src, str):
957        if flags & SRE_FLAG_LOCALE:
958            raise ValueError("cannot use LOCALE flag with a str pattern")
959        if not flags & SRE_FLAG_ASCII:
960            flags |= SRE_FLAG_UNICODE
961        elif flags & SRE_FLAG_UNICODE:
962            raise ValueError("ASCII and UNICODE flags are incompatible")
963    else:
964        if flags & SRE_FLAG_UNICODE:
965            raise ValueError("cannot use UNICODE flag with a bytes pattern")
966        if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
967            raise ValueError("ASCII and LOCALE flags are incompatible")
968    return flags
969
970def parse(str, flags=0, state=None):
971    # parse 're' pattern into list of (opcode, argument) tuples
972
973    source = Tokenizer(str)
974
975    if state is None:
976        state = State()
977    state.flags = flags
978    state.str = str
979
980    p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
981    p.state.flags = fix_flags(str, p.state.flags)
982
983    if source.next is not None:
984        assert source.next == ")"
985        raise source.error("unbalanced parenthesis")
986
987    for g in p.state.grouprefpos:
988        if g >= p.state.groups:
989            msg = "invalid group reference %d" % g
990            raise error(msg, str, p.state.grouprefpos[g])
991
992    if flags & SRE_FLAG_DEBUG:
993        p.dump()
994
995    return p
996
997def parse_template(source, pattern):
998    # parse 're' replacement string into list of literals and
999    # group references
1000    s = Tokenizer(source)
1001    sget = s.get
1002    result = []
1003    literal = []
1004    lappend = literal.append
1005    def addliteral():
1006        if s.istext:
1007            result.append(''.join(literal))
1008        else:
1009            # The tokenizer implicitly decodes bytes objects as latin-1, we must
1010            # therefore re-encode the final representation.
1011            result.append(''.join(literal).encode('latin-1'))
1012        del literal[:]
1013    def addgroup(index, pos):
1014        if index > pattern.groups:
1015            raise s.error("invalid group reference %d" % index, pos)
1016        addliteral()
1017        result.append(index)
1018    groupindex = pattern.groupindex
1019    while True:
1020        this = sget()
1021        if this is None:
1022            break # end of replacement string
1023        if this[0] == "\\":
1024            # group
1025            c = this[1]
1026            if c == "g":
1027                if not s.match("<"):
1028                    raise s.error("missing <")
1029                name = s.getuntil(">", "group name")
1030                if not (name.isdecimal() and name.isascii()):
1031                    s.checkgroupname(name, 1)
1032                    try:
1033                        index = groupindex[name]
1034                    except KeyError:
1035                        raise IndexError("unknown group name %r" % name) from None
1036                else:
1037                    index = int(name)
1038                    if index >= MAXGROUPS:
1039                        raise s.error("invalid group reference %d" % index,
1040                                      len(name) + 1)
1041                    if not (name.isdecimal() and name.isascii()):
1042                        import warnings
1043                        warnings.warn(
1044                            "bad character in group name %s at position %d" %
1045                            (repr(name) if s.istext else ascii(name),
1046                             s.tell() - len(name) - 1),
1047                            DeprecationWarning, stacklevel=5
1048                        )
1049                addgroup(index, len(name) + 1)
1050            elif c == "0":
1051                if s.next in OCTDIGITS:
1052                    this += sget()
1053                    if s.next in OCTDIGITS:
1054                        this += sget()
1055                lappend(chr(int(this[1:], 8) & 0xff))
1056            elif c in DIGITS:
1057                isoctal = False
1058                if s.next in DIGITS:
1059                    this += sget()
1060                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
1061                        s.next in OCTDIGITS):
1062                        this += sget()
1063                        isoctal = True
1064                        c = int(this[1:], 8)
1065                        if c > 0o377:
1066                            raise s.error('octal escape value %s outside of '
1067                                          'range 0-0o377' % this, len(this))
1068                        lappend(chr(c))
1069                if not isoctal:
1070                    addgroup(int(this[1:]), len(this) - 1)
1071            else:
1072                try:
1073                    this = chr(ESCAPES[this][1])
1074                except KeyError:
1075                    if c in ASCIILETTERS:
1076                        raise s.error('bad escape %s' % this, len(this)) from None
1077                lappend(this)
1078        else:
1079            lappend(this)
1080    addliteral()
1081    return result
1082