• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert re-style regular expression to sre pattern
5#
6# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13# XXX: show string offset and offending character for all errors
14
15from sre_constants import *
16
17SPECIAL_CHARS = ".\\[{()*+?^$|"
18REPEAT_CHARS = "*+?{"
19
20DIGITS = frozenset("0123456789")
21
22OCTDIGITS = frozenset("01234567")
23HEXDIGITS = frozenset("0123456789abcdefABCDEF")
24ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
25
26WHITESPACE = frozenset(" \t\n\r\v\f")
27
28_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
29_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
30
31ESCAPES = {
32    r"\a": (LITERAL, ord("\a")),
33    r"\b": (LITERAL, ord("\b")),
34    r"\f": (LITERAL, ord("\f")),
35    r"\n": (LITERAL, ord("\n")),
36    r"\r": (LITERAL, ord("\r")),
37    r"\t": (LITERAL, ord("\t")),
38    r"\v": (LITERAL, ord("\v")),
39    r"\\": (LITERAL, ord("\\"))
40}
41
42CATEGORIES = {
43    r"\A": (AT, AT_BEGINNING_STRING), # start of string
44    r"\b": (AT, AT_BOUNDARY),
45    r"\B": (AT, AT_NON_BOUNDARY),
46    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
47    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
48    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
49    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
50    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
51    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
52    r"\Z": (AT, AT_END_STRING), # end of string
53}
54
55FLAGS = {
56    # standard flags
57    "i": SRE_FLAG_IGNORECASE,
58    "L": SRE_FLAG_LOCALE,
59    "m": SRE_FLAG_MULTILINE,
60    "s": SRE_FLAG_DOTALL,
61    "x": SRE_FLAG_VERBOSE,
62    # extensions
63    "a": SRE_FLAG_ASCII,
64    "t": SRE_FLAG_TEMPLATE,
65    "u": SRE_FLAG_UNICODE,
66}
67
68TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
69GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
70
71class Verbose(Exception):
72    pass
73
74class State:
75    # keeps track of state for parsing
76    def __init__(self):
77        self.flags = 0
78        self.groupdict = {}
79        self.groupwidths = [None]  # group 0
80        self.lookbehindgroups = None
81    @property
82    def groups(self):
83        return len(self.groupwidths)
84    def opengroup(self, name=None):
85        gid = self.groups
86        self.groupwidths.append(None)
87        if self.groups > MAXGROUPS:
88            raise error("too many groups")
89        if name is not None:
90            ogid = self.groupdict.get(name, None)
91            if ogid is not None:
92                raise error("redefinition of group name %r as group %d; "
93                            "was group %d" % (name, gid,  ogid))
94            self.groupdict[name] = gid
95        return gid
96    def closegroup(self, gid, p):
97        self.groupwidths[gid] = p.getwidth()
98    def checkgroup(self, gid):
99        return gid < self.groups and self.groupwidths[gid] is not None
100
101    def checklookbehindgroup(self, gid, source):
102        if self.lookbehindgroups is not None:
103            if not self.checkgroup(gid):
104                raise source.error('cannot refer to an open group')
105            if gid >= self.lookbehindgroups:
106                raise source.error('cannot refer to group defined in the same '
107                                   'lookbehind subpattern')
108
109class SubPattern:
110    # a subpattern, in intermediate form
111    def __init__(self, state, data=None):
112        self.state = state
113        if data is None:
114            data = []
115        self.data = data
116        self.width = None
117
118    def dump(self, level=0):
119        nl = True
120        seqtypes = (tuple, list)
121        for op, av in self.data:
122            print(level*"  " + str(op), end='')
123            if op is IN:
124                # member sublanguage
125                print()
126                for op, a in av:
127                    print((level+1)*"  " + str(op), a)
128            elif op is BRANCH:
129                print()
130                for i, a in enumerate(av[1]):
131                    if i:
132                        print(level*"  " + "OR")
133                    a.dump(level+1)
134            elif op is GROUPREF_EXISTS:
135                condgroup, item_yes, item_no = av
136                print('', condgroup)
137                item_yes.dump(level+1)
138                if item_no:
139                    print(level*"  " + "ELSE")
140                    item_no.dump(level+1)
141            elif isinstance(av, seqtypes):
142                nl = False
143                for a in av:
144                    if isinstance(a, SubPattern):
145                        if not nl:
146                            print()
147                        a.dump(level+1)
148                        nl = True
149                    else:
150                        if not nl:
151                            print(' ', end='')
152                        print(a, end='')
153                        nl = False
154                if not nl:
155                    print()
156            else:
157                print('', av)
158    def __repr__(self):
159        return repr(self.data)
160    def __len__(self):
161        return len(self.data)
162    def __delitem__(self, index):
163        del self.data[index]
164    def __getitem__(self, index):
165        if isinstance(index, slice):
166            return SubPattern(self.state, self.data[index])
167        return self.data[index]
168    def __setitem__(self, index, code):
169        self.data[index] = code
170    def insert(self, index, code):
171        self.data.insert(index, code)
172    def append(self, code):
173        self.data.append(code)
174    def getwidth(self):
175        # determine the width (min, max) for this subpattern
176        if self.width is not None:
177            return self.width
178        lo = hi = 0
179        for op, av in self.data:
180            if op is BRANCH:
181                i = MAXREPEAT - 1
182                j = 0
183                for av in av[1]:
184                    l, h = av.getwidth()
185                    i = min(i, l)
186                    j = max(j, h)
187                lo = lo + i
188                hi = hi + j
189            elif op is CALL:
190                i, j = av.getwidth()
191                lo = lo + i
192                hi = hi + j
193            elif op is SUBPATTERN:
194                i, j = av[-1].getwidth()
195                lo = lo + i
196                hi = hi + j
197            elif op in _REPEATCODES:
198                i, j = av[2].getwidth()
199                lo = lo + i * av[0]
200                hi = hi + j * av[1]
201            elif op in _UNITCODES:
202                lo = lo + 1
203                hi = hi + 1
204            elif op is GROUPREF:
205                i, j = self.state.groupwidths[av]
206                lo = lo + i
207                hi = hi + j
208            elif op is GROUPREF_EXISTS:
209                i, j = av[1].getwidth()
210                if av[2] is not None:
211                    l, h = av[2].getwidth()
212                    i = min(i, l)
213                    j = max(j, h)
214                else:
215                    i = 0
216                lo = lo + i
217                hi = hi + j
218            elif op is SUCCESS:
219                break
220        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
221        return self.width
222
223class Tokenizer:
224    def __init__(self, string):
225        self.istext = isinstance(string, str)
226        self.string = string
227        if not self.istext:
228            string = str(string, 'latin1')
229        self.decoded_string = string
230        self.index = 0
231        self.next = None
232        self.__next()
233    def __next(self):
234        index = self.index
235        try:
236            char = self.decoded_string[index]
237        except IndexError:
238            self.next = None
239            return
240        if char == "\\":
241            index += 1
242            try:
243                char += self.decoded_string[index]
244            except IndexError:
245                raise error("bad escape (end of pattern)",
246                            self.string, len(self.string) - 1) from None
247        self.index = index + 1
248        self.next = char
249    def match(self, char):
250        if char == self.next:
251            self.__next()
252            return True
253        return False
254    def get(self):
255        this = self.next
256        self.__next()
257        return this
258    def getwhile(self, n, charset):
259        result = ''
260        for _ in range(n):
261            c = self.next
262            if c not in charset:
263                break
264            result += c
265            self.__next()
266        return result
267    def getuntil(self, terminator, name):
268        result = ''
269        while True:
270            c = self.next
271            self.__next()
272            if c is None:
273                if not result:
274                    raise self.error("missing " + name)
275                raise self.error("missing %s, unterminated name" % terminator,
276                                 len(result))
277            if c == terminator:
278                if not result:
279                    raise self.error("missing " + name, 1)
280                break
281            result += c
282        return result
283    @property
284    def pos(self):
285        return self.index - len(self.next or '')
286    def tell(self):
287        return self.index - len(self.next or '')
288    def seek(self, index):
289        self.index = index
290        self.__next()
291
292    def error(self, msg, offset=0):
293        return error(msg, self.string, self.tell() - offset)
294
295def _class_escape(source, escape):
296    # handle escape code inside character class
297    code = ESCAPES.get(escape)
298    if code:
299        return code
300    code = CATEGORIES.get(escape)
301    if code and code[0] is IN:
302        return code
303    try:
304        c = escape[1:2]
305        if c == "x":
306            # hexadecimal escape (exactly two digits)
307            escape += source.getwhile(2, HEXDIGITS)
308            if len(escape) != 4:
309                raise source.error("incomplete escape %s" % escape, len(escape))
310            return LITERAL, int(escape[2:], 16)
311        elif c == "u" and source.istext:
312            # unicode escape (exactly four digits)
313            escape += source.getwhile(4, HEXDIGITS)
314            if len(escape) != 6:
315                raise source.error("incomplete escape %s" % escape, len(escape))
316            return LITERAL, int(escape[2:], 16)
317        elif c == "U" and source.istext:
318            # unicode escape (exactly eight digits)
319            escape += source.getwhile(8, HEXDIGITS)
320            if len(escape) != 10:
321                raise source.error("incomplete escape %s" % escape, len(escape))
322            c = int(escape[2:], 16)
323            chr(c) # raise ValueError for invalid code
324            return LITERAL, c
325        elif c == "N" and source.istext:
326            import unicodedata
327            # named unicode escape e.g. \N{EM DASH}
328            if not source.match('{'):
329                raise source.error("missing {")
330            charname = source.getuntil('}', 'character name')
331            try:
332                c = ord(unicodedata.lookup(charname))
333            except KeyError:
334                raise source.error("undefined character name %r" % charname,
335                                   len(charname) + len(r'\N{}'))
336            return LITERAL, c
337        elif c in OCTDIGITS:
338            # octal escape (up to three digits)
339            escape += source.getwhile(2, OCTDIGITS)
340            c = int(escape[1:], 8)
341            if c > 0o377:
342                raise source.error('octal escape value %s outside of '
343                                   'range 0-0o377' % escape, len(escape))
344            return LITERAL, c
345        elif c in DIGITS:
346            raise ValueError
347        if len(escape) == 2:
348            if c in ASCIILETTERS:
349                raise source.error('bad escape %s' % escape, len(escape))
350            return LITERAL, ord(escape[1])
351    except ValueError:
352        pass
353    raise source.error("bad escape %s" % escape, len(escape))
354
355def _escape(source, escape, state):
356    # handle escape code in expression
357    code = CATEGORIES.get(escape)
358    if code:
359        return code
360    code = ESCAPES.get(escape)
361    if code:
362        return code
363    try:
364        c = escape[1:2]
365        if c == "x":
366            # hexadecimal escape
367            escape += source.getwhile(2, HEXDIGITS)
368            if len(escape) != 4:
369                raise source.error("incomplete escape %s" % escape, len(escape))
370            return LITERAL, int(escape[2:], 16)
371        elif c == "u" and source.istext:
372            # unicode escape (exactly four digits)
373            escape += source.getwhile(4, HEXDIGITS)
374            if len(escape) != 6:
375                raise source.error("incomplete escape %s" % escape, len(escape))
376            return LITERAL, int(escape[2:], 16)
377        elif c == "U" and source.istext:
378            # unicode escape (exactly eight digits)
379            escape += source.getwhile(8, HEXDIGITS)
380            if len(escape) != 10:
381                raise source.error("incomplete escape %s" % escape, len(escape))
382            c = int(escape[2:], 16)
383            chr(c) # raise ValueError for invalid code
384            return LITERAL, c
385        elif c == "N" and source.istext:
386            import unicodedata
387            # named unicode escape e.g. \N{EM DASH}
388            if not source.match('{'):
389                raise source.error("missing {")
390            charname = source.getuntil('}', 'character name')
391            try:
392                c = ord(unicodedata.lookup(charname))
393            except KeyError:
394                raise source.error("undefined character name %r" % charname,
395                                   len(charname) + len(r'\N{}'))
396            return LITERAL, c
397        elif c == "0":
398            # octal escape
399            escape += source.getwhile(2, OCTDIGITS)
400            return LITERAL, int(escape[1:], 8)
401        elif c in DIGITS:
402            # octal escape *or* decimal group reference (sigh)
403            if source.next in DIGITS:
404                escape += source.get()
405                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
406                    source.next in OCTDIGITS):
407                    # got three octal digits; this is an octal escape
408                    escape += source.get()
409                    c = int(escape[1:], 8)
410                    if c > 0o377:
411                        raise source.error('octal escape value %s outside of '
412                                           'range 0-0o377' % escape,
413                                           len(escape))
414                    return LITERAL, c
415            # not an octal escape, so this is a group reference
416            group = int(escape[1:])
417            if group < state.groups:
418                if not state.checkgroup(group):
419                    raise source.error("cannot refer to an open group",
420                                       len(escape))
421                state.checklookbehindgroup(group, source)
422                return GROUPREF, group
423            raise source.error("invalid group reference %d" % group, len(escape) - 1)
424        if len(escape) == 2:
425            if c in ASCIILETTERS:
426                raise source.error("bad escape %s" % escape, len(escape))
427            return LITERAL, ord(escape[1])
428    except ValueError:
429        pass
430    raise source.error("bad escape %s" % escape, len(escape))
431
432def _uniq(items):
433    return list(dict.fromkeys(items))
434
435def _parse_sub(source, state, verbose, nested):
436    # parse an alternation: a|b|c
437
438    items = []
439    itemsappend = items.append
440    sourcematch = source.match
441    start = source.tell()
442    while True:
443        itemsappend(_parse(source, state, verbose, nested + 1,
444                           not nested and not items))
445        if not sourcematch("|"):
446            break
447
448    if len(items) == 1:
449        return items[0]
450
451    subpattern = SubPattern(state)
452
453    # check if all items share a common prefix
454    while True:
455        prefix = None
456        for item in items:
457            if not item:
458                break
459            if prefix is None:
460                prefix = item[0]
461            elif item[0] != prefix:
462                break
463        else:
464            # all subitems start with a common "prefix".
465            # move it out of the branch
466            for item in items:
467                del item[0]
468            subpattern.append(prefix)
469            continue # check next one
470        break
471
472    # check if the branch can be replaced by a character set
473    set = []
474    for item in items:
475        if len(item) != 1:
476            break
477        op, av = item[0]
478        if op is LITERAL:
479            set.append((op, av))
480        elif op is IN and av[0][0] is not NEGATE:
481            set.extend(av)
482        else:
483            break
484    else:
485        # we can store this as a character set instead of a
486        # branch (the compiler may optimize this even more)
487        subpattern.append((IN, _uniq(set)))
488        return subpattern
489
490    subpattern.append((BRANCH, (None, items)))
491    return subpattern
492
493def _parse(source, state, verbose, nested, first=False):
494    # parse a simple pattern
495    subpattern = SubPattern(state)
496
497    # precompute constants into local variables
498    subpatternappend = subpattern.append
499    sourceget = source.get
500    sourcematch = source.match
501    _len = len
502    _ord = ord
503
504    while True:
505
506        this = source.next
507        if this is None:
508            break # end of pattern
509        if this in "|)":
510            break # end of subpattern
511        sourceget()
512
513        if verbose:
514            # skip whitespace and comments
515            if this in WHITESPACE:
516                continue
517            if this == "#":
518                while True:
519                    this = sourceget()
520                    if this is None or this == "\n":
521                        break
522                continue
523
524        if this[0] == "\\":
525            code = _escape(source, this, state)
526            subpatternappend(code)
527
528        elif this not in SPECIAL_CHARS:
529            subpatternappend((LITERAL, _ord(this)))
530
531        elif this == "[":
532            here = source.tell() - 1
533            # character set
534            set = []
535            setappend = set.append
536##          if sourcematch(":"):
537##              pass # handle character classes
538            if source.next == '[':
539                import warnings
540                warnings.warn(
541                    'Possible nested set at position %d' % source.tell(),
542                    FutureWarning, stacklevel=nested + 6
543                )
544            negate = sourcematch("^")
545            # check remaining characters
546            while True:
547                this = sourceget()
548                if this is None:
549                    raise source.error("unterminated character set",
550                                       source.tell() - here)
551                if this == "]" and set:
552                    break
553                elif this[0] == "\\":
554                    code1 = _class_escape(source, this)
555                else:
556                    if set and this in '-&~|' and source.next == this:
557                        import warnings
558                        warnings.warn(
559                            'Possible set %s at position %d' % (
560                                'difference' if this == '-' else
561                                'intersection' if this == '&' else
562                                'symmetric difference' if this == '~' else
563                                'union',
564                                source.tell() - 1),
565                            FutureWarning, stacklevel=nested + 6
566                        )
567                    code1 = LITERAL, _ord(this)
568                if sourcematch("-"):
569                    # potential range
570                    that = sourceget()
571                    if that is None:
572                        raise source.error("unterminated character set",
573                                           source.tell() - here)
574                    if that == "]":
575                        if code1[0] is IN:
576                            code1 = code1[1][0]
577                        setappend(code1)
578                        setappend((LITERAL, _ord("-")))
579                        break
580                    if that[0] == "\\":
581                        code2 = _class_escape(source, that)
582                    else:
583                        if that == '-':
584                            import warnings
585                            warnings.warn(
586                                'Possible set difference at position %d' % (
587                                    source.tell() - 2),
588                                FutureWarning, stacklevel=nested + 6
589                            )
590                        code2 = LITERAL, _ord(that)
591                    if code1[0] != LITERAL or code2[0] != LITERAL:
592                        msg = "bad character range %s-%s" % (this, that)
593                        raise source.error(msg, len(this) + 1 + len(that))
594                    lo = code1[1]
595                    hi = code2[1]
596                    if hi < lo:
597                        msg = "bad character range %s-%s" % (this, that)
598                        raise source.error(msg, len(this) + 1 + len(that))
599                    setappend((RANGE, (lo, hi)))
600                else:
601                    if code1[0] is IN:
602                        code1 = code1[1][0]
603                    setappend(code1)
604
605            set = _uniq(set)
606            # XXX: <fl> should move set optimization to compiler!
607            if _len(set) == 1 and set[0][0] is LITERAL:
608                # optimization
609                if negate:
610                    subpatternappend((NOT_LITERAL, set[0][1]))
611                else:
612                    subpatternappend(set[0])
613            else:
614                if negate:
615                    set.insert(0, (NEGATE, None))
616                # charmap optimization can't be added here because
617                # global flags still are not known
618                subpatternappend((IN, set))
619
620        elif this in REPEAT_CHARS:
621            # repeat previous item
622            here = source.tell()
623            if this == "?":
624                min, max = 0, 1
625            elif this == "*":
626                min, max = 0, MAXREPEAT
627
628            elif this == "+":
629                min, max = 1, MAXREPEAT
630            elif this == "{":
631                if source.next == "}":
632                    subpatternappend((LITERAL, _ord(this)))
633                    continue
634
635                min, max = 0, MAXREPEAT
636                lo = hi = ""
637                while source.next in DIGITS:
638                    lo += sourceget()
639                if sourcematch(","):
640                    while source.next in DIGITS:
641                        hi += sourceget()
642                else:
643                    hi = lo
644                if not sourcematch("}"):
645                    subpatternappend((LITERAL, _ord(this)))
646                    source.seek(here)
647                    continue
648
649                if lo:
650                    min = int(lo)
651                    if min >= MAXREPEAT:
652                        raise OverflowError("the repetition number is too large")
653                if hi:
654                    max = int(hi)
655                    if max >= MAXREPEAT:
656                        raise OverflowError("the repetition number is too large")
657                    if max < min:
658                        raise source.error("min repeat greater than max repeat",
659                                           source.tell() - here)
660            else:
661                raise AssertionError("unsupported quantifier %r" % (char,))
662            # figure out which item to repeat
663            if subpattern:
664                item = subpattern[-1:]
665            else:
666                item = None
667            if not item or item[0][0] is AT:
668                raise source.error("nothing to repeat",
669                                   source.tell() - here + len(this))
670            if item[0][0] in _REPEATCODES:
671                raise source.error("multiple repeat",
672                                   source.tell() - here + len(this))
673            if item[0][0] is SUBPATTERN:
674                group, add_flags, del_flags, p = item[0][1]
675                if group is None and not add_flags and not del_flags:
676                    item = p
677            if sourcematch("?"):
678                subpattern[-1] = (MIN_REPEAT, (min, max, item))
679            else:
680                subpattern[-1] = (MAX_REPEAT, (min, max, item))
681
682        elif this == ".":
683            subpatternappend((ANY, None))
684
685        elif this == "(":
686            start = source.tell() - 1
687            group = True
688            name = None
689            add_flags = 0
690            del_flags = 0
691            if sourcematch("?"):
692                # options
693                char = sourceget()
694                if char is None:
695                    raise source.error("unexpected end of pattern")
696                if char == "P":
697                    # python extensions
698                    if sourcematch("<"):
699                        # named group: skip forward to end of name
700                        name = source.getuntil(">", "group name")
701                        if not name.isidentifier():
702                            msg = "bad character in group name %r" % name
703                            raise source.error(msg, len(name) + 1)
704                    elif sourcematch("="):
705                        # named backreference
706                        name = source.getuntil(")", "group name")
707                        if not name.isidentifier():
708                            msg = "bad character in group name %r" % name
709                            raise source.error(msg, len(name) + 1)
710                        gid = state.groupdict.get(name)
711                        if gid is None:
712                            msg = "unknown group name %r" % name
713                            raise source.error(msg, len(name) + 1)
714                        if not state.checkgroup(gid):
715                            raise source.error("cannot refer to an open group",
716                                               len(name) + 1)
717                        state.checklookbehindgroup(gid, source)
718                        subpatternappend((GROUPREF, gid))
719                        continue
720
721                    else:
722                        char = sourceget()
723                        if char is None:
724                            raise source.error("unexpected end of pattern")
725                        raise source.error("unknown extension ?P" + char,
726                                           len(char) + 2)
727                elif char == ":":
728                    # non-capturing group
729                    group = None
730                elif char == "#":
731                    # comment
732                    while True:
733                        if source.next is None:
734                            raise source.error("missing ), unterminated comment",
735                                               source.tell() - start)
736                        if sourceget() == ")":
737                            break
738                    continue
739
740                elif char in "=!<":
741                    # lookahead assertions
742                    dir = 1
743                    if char == "<":
744                        char = sourceget()
745                        if char is None:
746                            raise source.error("unexpected end of pattern")
747                        if char not in "=!":
748                            raise source.error("unknown extension ?<" + char,
749                                               len(char) + 2)
750                        dir = -1 # lookbehind
751                        lookbehindgroups = state.lookbehindgroups
752                        if lookbehindgroups is None:
753                            state.lookbehindgroups = state.groups
754                    p = _parse_sub(source, state, verbose, nested + 1)
755                    if dir < 0:
756                        if lookbehindgroups is None:
757                            state.lookbehindgroups = None
758                    if not sourcematch(")"):
759                        raise source.error("missing ), unterminated subpattern",
760                                           source.tell() - start)
761                    if char == "=":
762                        subpatternappend((ASSERT, (dir, p)))
763                    else:
764                        subpatternappend((ASSERT_NOT, (dir, p)))
765                    continue
766
767                elif char == "(":
768                    # conditional backreference group
769                    condname = source.getuntil(")", "group name")
770                    if condname.isidentifier():
771                        condgroup = state.groupdict.get(condname)
772                        if condgroup is None:
773                            msg = "unknown group name %r" % condname
774                            raise source.error(msg, len(condname) + 1)
775                    else:
776                        try:
777                            condgroup = int(condname)
778                            if condgroup < 0:
779                                raise ValueError
780                        except ValueError:
781                            msg = "bad character in group name %r" % condname
782                            raise source.error(msg, len(condname) + 1) from None
783                        if not condgroup:
784                            raise source.error("bad group number",
785                                               len(condname) + 1)
786                        if condgroup >= MAXGROUPS:
787                            msg = "invalid group reference %d" % condgroup
788                            raise source.error(msg, len(condname) + 1)
789                    state.checklookbehindgroup(condgroup, source)
790                    item_yes = _parse(source, state, verbose, nested + 1)
791                    if source.match("|"):
792                        item_no = _parse(source, state, verbose, nested + 1)
793                        if source.next == "|":
794                            raise source.error("conditional backref with more than two branches")
795                    else:
796                        item_no = None
797                    if not source.match(")"):
798                        raise source.error("missing ), unterminated subpattern",
799                                           source.tell() - start)
800                    subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
801                    continue
802
803                elif char in FLAGS or char == "-":
804                    # flags
805                    flags = _parse_flags(source, state, char)
806                    if flags is None:  # global flags
807                        if not first or subpattern:
808                            import warnings
809                            warnings.warn(
810                                'Flags not at the start of the expression %r%s' % (
811                                    source.string[:20],  # truncate long regexes
812                                    ' (truncated)' if len(source.string) > 20 else '',
813                                ),
814                                DeprecationWarning, stacklevel=nested + 6
815                            )
816                        if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
817                            raise Verbose
818                        continue
819
820                    add_flags, del_flags = flags
821                    group = None
822                else:
823                    raise source.error("unknown extension ?" + char,
824                                       len(char) + 1)
825
826            # parse group contents
827            if group is not None:
828                try:
829                    group = state.opengroup(name)
830                except error as err:
831                    raise source.error(err.msg, len(name) + 1) from None
832            sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
833                           not (del_flags & SRE_FLAG_VERBOSE))
834            p = _parse_sub(source, state, sub_verbose, nested + 1)
835            if not source.match(")"):
836                raise source.error("missing ), unterminated subpattern",
837                                   source.tell() - start)
838            if group is not None:
839                state.closegroup(group, p)
840            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
841
842        elif this == "^":
843            subpatternappend((AT, AT_BEGINNING))
844
845        elif this == "$":
846            subpatternappend((AT, AT_END))
847
848        else:
849            raise AssertionError("unsupported special character %r" % (char,))
850
851    # unpack non-capturing groups
852    for i in range(len(subpattern))[::-1]:
853        op, av = subpattern[i]
854        if op is SUBPATTERN:
855            group, add_flags, del_flags, p = av
856            if group is None and not add_flags and not del_flags:
857                subpattern[i: i+1] = p
858
859    return subpattern
860
861def _parse_flags(source, state, char):
862    sourceget = source.get
863    add_flags = 0
864    del_flags = 0
865    if char != "-":
866        while True:
867            flag = FLAGS[char]
868            if source.istext:
869                if char == 'L':
870                    msg = "bad inline flags: cannot use 'L' flag with a str pattern"
871                    raise source.error(msg)
872            else:
873                if char == 'u':
874                    msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
875                    raise source.error(msg)
876            add_flags |= flag
877            if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
878                msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
879                raise source.error(msg)
880            char = sourceget()
881            if char is None:
882                raise source.error("missing -, : or )")
883            if char in ")-:":
884                break
885            if char not in FLAGS:
886                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
887                raise source.error(msg, len(char))
888    if char == ")":
889        state.flags |= add_flags
890        return None
891    if add_flags & GLOBAL_FLAGS:
892        raise source.error("bad inline flags: cannot turn on global flag", 1)
893    if char == "-":
894        char = sourceget()
895        if char is None:
896            raise source.error("missing flag")
897        if char not in FLAGS:
898            msg = "unknown flag" if char.isalpha() else "missing flag"
899            raise source.error(msg, len(char))
900        while True:
901            flag = FLAGS[char]
902            if flag & TYPE_FLAGS:
903                msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
904                raise source.error(msg)
905            del_flags |= flag
906            char = sourceget()
907            if char is None:
908                raise source.error("missing :")
909            if char == ":":
910                break
911            if char not in FLAGS:
912                msg = "unknown flag" if char.isalpha() else "missing :"
913                raise source.error(msg, len(char))
914    assert char == ":"
915    if del_flags & GLOBAL_FLAGS:
916        raise source.error("bad inline flags: cannot turn off global flag", 1)
917    if add_flags & del_flags:
918        raise source.error("bad inline flags: flag turned on and off", 1)
919    return add_flags, del_flags
920
921def fix_flags(src, flags):
922    # Check and fix flags according to the type of pattern (str or bytes)
923    if isinstance(src, str):
924        if flags & SRE_FLAG_LOCALE:
925            raise ValueError("cannot use LOCALE flag with a str pattern")
926        if not flags & SRE_FLAG_ASCII:
927            flags |= SRE_FLAG_UNICODE
928        elif flags & SRE_FLAG_UNICODE:
929            raise ValueError("ASCII and UNICODE flags are incompatible")
930    else:
931        if flags & SRE_FLAG_UNICODE:
932            raise ValueError("cannot use UNICODE flag with a bytes pattern")
933        if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
934            raise ValueError("ASCII and LOCALE flags are incompatible")
935    return flags
936
937def parse(str, flags=0, state=None):
938    # parse 're' pattern into list of (opcode, argument) tuples
939
940    source = Tokenizer(str)
941
942    if state is None:
943        state = State()
944    state.flags = flags
945    state.str = str
946
947    try:
948        p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
949    except Verbose:
950        # the VERBOSE flag was switched on inside the pattern.  to be
951        # on the safe side, we'll parse the whole thing again...
952        state = State()
953        state.flags = flags | SRE_FLAG_VERBOSE
954        state.str = str
955        source.seek(0)
956        p = _parse_sub(source, state, True, 0)
957
958    p.state.flags = fix_flags(str, p.state.flags)
959
960    if source.next is not None:
961        assert source.next == ")"
962        raise source.error("unbalanced parenthesis")
963
964    if flags & SRE_FLAG_DEBUG:
965        p.dump()
966
967    return p
968
969def parse_template(source, state):
970    # parse 're' replacement string into list of literals and
971    # group references
972    s = Tokenizer(source)
973    sget = s.get
974    groups = []
975    literals = []
976    literal = []
977    lappend = literal.append
978    def addgroup(index, pos):
979        if index > state.groups:
980            raise s.error("invalid group reference %d" % index, pos)
981        if literal:
982            literals.append(''.join(literal))
983            del literal[:]
984        groups.append((len(literals), index))
985        literals.append(None)
986    groupindex = state.groupindex
987    while True:
988        this = sget()
989        if this is None:
990            break # end of replacement string
991        if this[0] == "\\":
992            # group
993            c = this[1]
994            if c == "g":
995                name = ""
996                if not s.match("<"):
997                    raise s.error("missing <")
998                name = s.getuntil(">", "group name")
999                if name.isidentifier():
1000                    try:
1001                        index = groupindex[name]
1002                    except KeyError:
1003                        raise IndexError("unknown group name %r" % name)
1004                else:
1005                    try:
1006                        index = int(name)
1007                        if index < 0:
1008                            raise ValueError
1009                    except ValueError:
1010                        raise s.error("bad character in group name %r" % name,
1011                                      len(name) + 1) from None
1012                    if index >= MAXGROUPS:
1013                        raise s.error("invalid group reference %d" % index,
1014                                      len(name) + 1)
1015                addgroup(index, len(name) + 1)
1016            elif c == "0":
1017                if s.next in OCTDIGITS:
1018                    this += sget()
1019                    if s.next in OCTDIGITS:
1020                        this += sget()
1021                lappend(chr(int(this[1:], 8) & 0xff))
1022            elif c in DIGITS:
1023                isoctal = False
1024                if s.next in DIGITS:
1025                    this += sget()
1026                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
1027                        s.next in OCTDIGITS):
1028                        this += sget()
1029                        isoctal = True
1030                        c = int(this[1:], 8)
1031                        if c > 0o377:
1032                            raise s.error('octal escape value %s outside of '
1033                                          'range 0-0o377' % this, len(this))
1034                        lappend(chr(c))
1035                if not isoctal:
1036                    addgroup(int(this[1:]), len(this) - 1)
1037            else:
1038                try:
1039                    this = chr(ESCAPES[this][1])
1040                except KeyError:
1041                    if c in ASCIILETTERS:
1042                        raise s.error('bad escape %s' % this, len(this))
1043                lappend(this)
1044        else:
1045            lappend(this)
1046    if literal:
1047        literals.append(''.join(literal))
1048    if not isinstance(source, str):
1049        # The tokenizer implicitly decodes bytes objects as latin-1, we must
1050        # therefore re-encode the final representation.
1051        literals = [None if s is None else s.encode('latin-1') for s in literals]
1052    return groups, literals
1053
1054def expand_template(template, match):
1055    g = match.group
1056    empty = match.string[:0]
1057    groups, literals = template
1058    literals = literals[:]
1059    try:
1060        for index, group in groups:
1061            literals[index] = g(group) or empty
1062    except IndexError:
1063        raise error("invalid group reference %d" % index)
1064    return empty.join(literals)
1065