• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert re-style regular expression to sre pattern
5#
6# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13# XXX: show string offset and offending character for all errors
14
15from sre_constants import *
16
# Characters the parser treats specially, and the subset of them that
# introduces a repetition operator.
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"

# Character sets the parser tests single tokens against.
OCTDIGITS = frozenset("01234567")
DIGITS = OCTDIGITS | frozenset("89")
HEXDIGITS = DIGITS | frozenset("abcdefABCDEF")
ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyz"
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ")

WHITESPACE = frozenset(" \t\n\r\v\f")

# Opcode groups used by SubPattern.getwidth(): repeat operators scale the
# width of their operand, unit codes always contribute a width of one.
_REPEATCODES = frozenset((MIN_REPEAT, MAX_REPEAT))
_UNITCODES = frozenset((ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY))

# Escapes that stand for one literal character.
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\")),
}

# Escapes that stand for a position assertion (AT) or a character
# category (IN).
CATEGORIES = {
    # start of string
    r"\A": (AT, AT_BEGINNING_STRING),
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    # end of string
    r"\Z": (AT, AT_END_STRING),
}

# Inline flag letters, as written inside "(?...)".
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "a": SRE_FLAG_ASCII,
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}

# Flags that _parse_flags refuses to turn on or off inside a scoped
# "(?flags:...)" group.
GLOBAL_FLAGS = (
    SRE_FLAG_ASCII
    | SRE_FLAG_LOCALE
    | SRE_FLAG_UNICODE
    | SRE_FLAG_DEBUG
    | SRE_FLAG_TEMPLATE
)
70
class Verbose(Exception):
    """Raised by _parse_flags when a global inline flag group turns on the
    VERBOSE flag and that flag is not yet set in the pattern state."""
73
class Pattern:
    """Master pattern object; keeps track of global attributes.

    flags            -- flag bits accumulated for the whole pattern
    groupdict        -- maps group names to group ids
    groupwidths      -- one entry per group (index 0 is group 0); an entry
                        is None until closegroup() stores the group's width
    lookbehindgroups -- None, or the group count recorded by the parser
                        when it entered a lookbehind assertion
    """

    def __init__(self):
        self.flags = 0
        self.groupdict = {}
        self.groupwidths = [None]  # group 0
        self.lookbehindgroups = None

    @property
    def groups(self):
        """Number of groups allocated so far, group 0 included."""
        return len(self.groupwidths)

    def opengroup(self, name=None):
        """Allocate the next group id, optionally binding it to *name*.

        Raises error if the group limit is exceeded or *name* is already
        bound.  The slot is reserved before either check runs.
        """
        gid = self.groups
        self.groupwidths.append(None)
        if self.groups > MAXGROUPS:
            raise error("too many groups")
        if name is None:
            return gid
        previous = self.groupdict.get(name, None)
        if previous is not None:
            raise error("redefinition of group name %r as group %d; "
                        "was group %d" % (name, gid,  previous))
        self.groupdict[name] = gid
        return gid

    def closegroup(self, gid, p):
        """Mark group *gid* as closed by recording the width of *p*."""
        self.groupwidths[gid] = p.getwidth()

    def checkgroup(self, gid):
        """Return True if *gid* names a group that has been closed."""
        if not gid < self.groups:
            return False
        return self.groupwidths[gid] is not None

    def checklookbehindgroup(self, gid, source):
        """Reject references that are not allowed inside a lookbehind.

        Does nothing unless a lookbehind is being parsed.  Otherwise raises
        source.error() when *gid* is still open or was defined inside the
        same lookbehind subpattern.
        """
        if self.lookbehindgroups is None:
            return
        if not self.checkgroup(gid):
            raise source.error('cannot refer to an open group')
        if gid >= self.lookbehindgroups:
            raise source.error('cannot refer to group defined in the same '
                               'lookbehind subpattern')
108
class SubPattern:
    """A subpattern in intermediate form.

    Wraps a list of (opcode, argument) pairs and forwards the usual
    sequence operations to it.  *pattern* is the owning Pattern object;
    getwidth() reads its groupwidths for group references.
    """

    def __init__(self, pattern, data=None):
        self.pattern = pattern
        self.data = [] if data is None else data
        self.width = None  # cached result of getwidth()

    def dump(self, level=0):
        """Print an indented, human-readable listing of the items."""
        indent = level * "  "
        child_indent = (level + 1) * "  "
        for op, av in self.data:
            print(indent + str(op), end='')
            if op is IN:
                # member sublanguage: one line per set member
                print()
                for member_op, member_arg in av:
                    print(child_indent + str(member_op), member_arg)
            elif op is BRANCH:
                print()
                for position, alternative in enumerate(av[1]):
                    if position:
                        print(indent + "OR")
                    alternative.dump(level + 1)
            elif op is GROUPREF_EXISTS:
                condgroup, item_yes, item_no = av
                print('', condgroup)
                item_yes.dump(level + 1)
                # an empty "no" branch is falsy and is not listed
                if item_no:
                    print(indent + "ELSE")
                    item_no.dump(level + 1)
            elif isinstance(av, (tuple, list)):
                # scalars share the opcode's line; nested subpatterns
                # start on a fresh line
                line_finished = False
                for part in av:
                    if isinstance(part, SubPattern):
                        if not line_finished:
                            print()
                        part.dump(level + 1)
                        line_finished = True
                    else:
                        if not line_finished:
                            print(' ', end='')
                        print(part, end='')
                        line_finished = False
                if not line_finished:
                    print()
            else:
                print('', av)

    def __repr__(self):
        return repr(self.data)

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        # slices stay SubPatterns so they can be nested in other items
        if not isinstance(index, slice):
            return self.data[index]
        return SubPattern(self.pattern, self.data[index])

    def __setitem__(self, index, code):
        self.data[index] = code

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        """Return the (min, max) width of this subpattern.

        The result is cached in self.width on first use.  The minimum is
        capped at MAXREPEAT - 1 and the maximum at MAXREPEAT.
        """
        if self.width is not None:
            return self.width
        total_lo = total_hi = 0
        for op, av in self.data:
            if op is BRANCH:
                # narrowest minimum and widest maximum of all alternatives
                item_lo = MAXREPEAT - 1
                item_hi = 0
                for alternative in av[1]:
                    alt_lo, alt_hi = alternative.getwidth()
                    item_lo = min(item_lo, alt_lo)
                    item_hi = max(item_hi, alt_hi)
            elif op is CALL:
                item_lo, item_hi = av.getwidth()
            elif op is SUBPATTERN:
                item_lo, item_hi = av[-1].getwidth()
            elif op in _REPEATCODES:
                body_lo, body_hi = av[2].getwidth()
                item_lo = body_lo * av[0]
                item_hi = body_hi * av[1]
            elif op in _UNITCODES:
                item_lo = item_hi = 1
            elif op is GROUPREF:
                item_lo, item_hi = self.pattern.groupwidths[av]
            elif op is GROUPREF_EXISTS:
                item_lo, item_hi = av[1].getwidth()
                if av[2] is not None:
                    no_lo, no_hi = av[2].getwidth()
                    item_lo = min(item_lo, no_lo)
                    item_hi = max(item_hi, no_hi)
                else:
                    # without a "no" branch the item may match nothing
                    item_lo = 0
            elif op is SUCCESS:
                break
            else:
                # every other opcode contributes no width
                continue
            total_lo += item_lo
            total_hi += item_hi
        self.width = min(total_lo, MAXREPEAT - 1), min(total_hi, MAXREPEAT)
        return self.width
221
class Tokenizer:
    """Split a pattern into tokens with one token of lookahead.

    A token is a single character, or a backslash together with the
    character after it.  The pending token is exposed as self.next and is
    None once the pattern is exhausted.  Bytes patterns are decoded as
    latin1 for scanning; self.string keeps the original object.
    """

    def __init__(self, string):
        self.istext = isinstance(string, str)
        self.string = string
        self.decoded_string = string if self.istext else str(string, 'latin1')
        self.index = 0
        self.next = None
        self.__next()

    def __next(self):
        # Load the token at self.index into self.next and move self.index
        # just past it.  At the end of the pattern only self.next changes.
        position = self.index
        text = self.decoded_string
        try:
            token = text[position]
        except IndexError:
            self.next = None
            return
        if token == "\\":
            position += 1
            try:
                token += text[position]
            except IndexError:
                raise error("bad escape (end of pattern)",
                            self.string, len(self.string) - 1) from None
        self.index = position + 1
        self.next = token

    def match(self, char):
        """Consume the pending token if it equals *char*; report success."""
        if char != self.next:
            return False
        self.__next()
        return True

    def get(self):
        """Return the pending token (or None) and advance."""
        token = self.next
        self.__next()
        return token

    def getwhile(self, n, charset):
        """Consume up to *n* tokens that are in *charset*; return them joined."""
        taken = []
        for _ in range(n):
            token = self.next
            if token not in charset:
                break
            taken.append(token)
            self.__next()
        return ''.join(taken)

    def getuntil(self, terminator):
        """Consume tokens up to and including *terminator*.

        Returns the text before the terminator.  Raises self.error() if
        that text is empty or the pattern ends first.
        """
        pieces = []
        while True:
            token = self.next
            self.__next()
            if token is None:
                if not pieces:
                    raise self.error("missing group name")
                raise self.error("missing %s, unterminated name" % terminator,
                                 len(''.join(pieces)))
            if token == terminator:
                if not pieces:
                    raise self.error("missing group name", 1)
                break
            pieces.append(token)
        return ''.join(pieces)

    @property
    def pos(self):
        """Index in the pattern at which the pending token starts."""
        pending = self.next or ''
        return self.index - len(pending)

    def tell(self):
        """Same value as the pos property, as a method."""
        pending = self.next or ''
        return self.index - len(pending)

    def seek(self, index):
        """Restart tokenizing at *index*."""
        self.index = index
        self.__next()

    def error(self, msg, offset=0):
        """Build (not raise) an error located *offset* before tell()."""
        return error(msg, self.string, self.tell() - offset)
293
def _class_escape(source, escape):
    """Translate an escape found inside a character class.

    *escape* is the backslash token already read from *source*; any digits
    that belong to it are consumed from *source* here.  Returns an
    (opcode, argument) pair, or raises source.error() for a bad escape.
    """
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    # only character categories are accepted here, not AT assertions
    if code and code[0] is IN:
        return code
    kind = escape[1:2]
    try:
        if kind == "x" or (kind in ("u", "U") and source.istext):
            # fixed-width hex escapes: \xhh, \uhhhh, \Uhhhhhhhh
            ndigits = {"x": 2, "u": 4, "U": 8}[kind]
            escape += source.getwhile(ndigits, HEXDIGITS)
            if len(escape) != ndigits + 2:
                raise source.error("incomplete escape %s" % escape, len(escape))
            value = int(escape[2:], 16)
            if kind == "U":
                chr(value) # raise ValueError for invalid code
            return LITERAL, value
        if kind in OCTDIGITS:
            # octal escape (up to three digits)
            escape += source.getwhile(2, OCTDIGITS)
            value = int(escape[1:], 8)
            if value > 0o377:
                raise source.error('octal escape value %s outside of '
                                   'range 0-0o377' % escape, len(escape))
            return LITERAL, value
        if kind in DIGITS:
            # 8 and 9 cannot start an escape inside a class
            raise ValueError
        if len(escape) == 2:
            if kind in ASCIILETTERS:
                raise source.error('bad escape %s' % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
341
def _escape(source, escape, state):
    """Translate an escape found outside a character class.

    *escape* is the backslash token already read from *source*; digits that
    belong to it are consumed here.  *state* is the Pattern used to validate
    numeric group references.  Returns an (opcode, argument) pair, or raises
    source.error() for a bad escape or an invalid group reference.
    """
    # categories win over literal escapes here
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    kind = escape[1:2]
    try:
        if kind == "x" or (kind in ("u", "U") and source.istext):
            # fixed-width hex escapes: \xhh, \uhhhh, \Uhhhhhhhh
            ndigits = {"x": 2, "u": 4, "U": 8}[kind]
            escape += source.getwhile(ndigits, HEXDIGITS)
            if len(escape) != ndigits + 2:
                raise source.error("incomplete escape %s" % escape, len(escape))
            value = int(escape[2:], 16)
            if kind == "U":
                chr(value) # raise ValueError for invalid code
            return LITERAL, value
        if kind == "0":
            # octal escape
            escape += source.getwhile(2, OCTDIGITS)
            return LITERAL, int(escape[1:], 8)
        if kind in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape += source.get()
                three_octal = (escape[1] in OCTDIGITS and
                               escape[2] in OCTDIGITS and
                               source.next in OCTDIGITS)
                if three_octal:
                    # got three octal digits; this is an octal escape
                    escape += source.get()
                    value = int(escape[1:], 8)
                    if value > 0o377:
                        raise source.error('octal escape value %s outside of '
                                           'range 0-0o377' % escape,
                                           len(escape))
                    return LITERAL, value
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if not group < state.groups:
                raise source.error("invalid group reference %d" % group, len(escape) - 1)
            if not state.checkgroup(group):
                raise source.error("cannot refer to an open group",
                                   len(escape))
            state.checklookbehindgroup(group, source)
            return GROUPREF, group
        if len(escape) == 2:
            if kind in ASCIILETTERS:
                raise source.error("bad escape %s" % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
406
def _parse_sub(source, state, verbose, nested=True):
    """Parse an alternation: a|b|c

    Each alternative is parsed with _parse().  A single alternative is
    returned as is.  Otherwise the result is a new SubPattern holding any
    items common to the start of every alternative, followed by either an
    IN item (when every remaining alternative is one LITERAL) or a BRANCH
    item.

    *nested* is not used by this function; it is kept so that existing
    callers passing it keep working.
    """
    items = [_parse(source, state, verbose)]
    while source.match("|"):
        items.append(_parse(source, state, verbose))

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)

    # check if all items share a common prefix
    while True:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpattern.append(prefix)
            continue # check next one
        break

    # check if the branch can be replaced by a character set
    if all(len(item) == 1 and item[0][0] is LITERAL for item in items):
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        subpattern.append((IN, [item[0] for item in items]))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern
456
def _parse_sub_cond(source, state, condgroup, verbose):
    """Parse the body of a conditional group: yes-pattern[|no-pattern].

    Returns a SubPattern holding one GROUPREF_EXISTS item whose "no" part
    is None when the pattern has no "|".  Raises source.error() if a
    second "|" follows the "no" part.
    """
    item_yes = _parse(source, state, verbose)
    item_no = None
    if source.match("|"):
        item_no = _parse(source, state, verbose)
        if source.next == "|":
            raise source.error("conditional backref with more than two branches")
    conditional = SubPattern(state)
    conditional.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return conditional
468
def _parse(source, state, verbose):
    """Parse a simple pattern (one alternative of an alternation).

    Tokens are read from *source* until the pattern ends or an unconsumed
    "|" or ")" is pending.  The parsed items are returned as a SubPattern
    bound to *state*.

    source  -- Tokenizer supplying the tokens
    state   -- Pattern object holding flags and group bookkeeping
    verbose -- when true, unescaped whitespace and "#" comments are skipped

    Raises the exception built by source.error() for malformed input and
    OverflowError for a repeat count of MAXREPEAT or more.  Verbose may
    propagate from _parse_flags().
    """
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    _ord = ord

    while True:

        this = source.next
        if this is None:
            break # end of pattern
        if this in "|)":
            break # end of subpattern
        sourceget()

        if verbose:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while True:
                    this = sourceget()
                    if this is None or this == "\n":
                        break
                continue

        if this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        elif this not in SPECIAL_CHARS:
            subpatternappend((LITERAL, _ord(this)))

        elif this == "[":
            here = source.tell() - 1
            # character set
            charset = []
            setappend = charset.append
            if sourcematch("^"):
                setappend((NEGATE, None))
            # check remaining characters; a "]" only closes the set once
            # at least one member has been added
            start = charset[:]
            while True:
                this = sourceget()
                if this is None:
                    raise source.error("unterminated character set",
                                       source.tell() - here)
                if this == "]" and charset != start:
                    break
                elif this[0] == "\\":
                    code1 = _class_escape(source, this)
                else:
                    code1 = LITERAL, _ord(this)
                if sourcematch("-"):
                    # potential range
                    that = sourceget()
                    if that is None:
                        raise source.error("unterminated character set",
                                           source.tell() - here)
                    if that == "]":
                        # trailing "-" is a literal hyphen
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        setappend(code1)
                        setappend((LITERAL, _ord("-")))
                        break
                    if that[0] == "\\":
                        code2 = _class_escape(source, that)
                    else:
                        code2 = LITERAL, _ord(that)
                    if code1[0] != LITERAL or code2[0] != LITERAL:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    lo = code1[1]
                    hi = code2[1]
                    if hi < lo:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    setappend((RANGE, (lo, hi)))
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    setappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(charset)==1 and charset[0][0] is LITERAL:
                subpatternappend(charset[0]) # optimization
            elif (_len(charset)==2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, charset))

        elif this in REPEAT_CHARS:
            # repeat previous item
            here = source.tell()
            if this == "?":
                min_count, max_count = 0, 1
            elif this == "*":
                min_count, max_count = 0, MAXREPEAT

            elif this == "+":
                min_count, max_count = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    subpatternappend((LITERAL, _ord(this)))
                    continue
                min_count, max_count = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo += sourceget()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi += sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a repeat after all: treat "{" as a literal and
                    # rescan what followed it
                    subpatternappend((LITERAL, _ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min_count = int(lo)
                    if min_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    max_count = int(hi)
                    if max_count >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if max_count < min_count:
                        raise source.error("min repeat greater than max repeat",
                                           source.tell() - here)
            else:
                raise AssertionError("unsupported quantifier %r" % (this,))
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] is AT):
                raise source.error("nothing to repeat",
                                   source.tell() - here + len(this))
            if item[0][0] in _REPEATCODES:
                raise source.error("multiple repeat",
                                   source.tell() - here + len(this))
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min_count, max_count, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min_count, max_count, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            start = source.tell() - 1
            # group stays True for a capturing group and becomes None for
            # every non-capturing form
            group = True
            name = None
            condgroup = None
            add_flags = 0
            del_flags = 0
            if sourcematch("?"):
                # options
                char = sourceget()
                if char is None:
                    raise source.error("unexpected end of pattern")
                if char == "P":
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = source.getuntil(">")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                    elif sourcematch("="):
                        # named backreference
                        name = source.getuntil(")")
                        if not name.isidentifier():
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        if not state.checkgroup(gid):
                            raise source.error("cannot refer to an open group",
                                               len(name) + 1)
                        state.checklookbehindgroup(gid, source)
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        raise source.error("unknown extension ?P" + char,
                                           len(char) + 2)
                elif char == ":":
                    # non-capturing group
                    group = None
                elif char == "#":
                    # comment
                    while True:
                        if source.next is None:
                            raise source.error("missing ), unterminated comment",
                                               source.tell() - start)
                        if sourceget() == ")":
                            break
                    continue
                elif char in "=!<":
                    # lookahead assertions
                    direction = 1
                    if char == "<":
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        if char not in "=!":
                            raise source.error("unknown extension ?<" + char,
                                               len(char) + 2)
                        direction = -1 # lookbehind
                        # only the outermost lookbehind records (and later
                        # clears) the group count
                        lookbehindgroups = state.lookbehindgroups
                        if lookbehindgroups is None:
                            state.lookbehindgroups = state.groups
                    p = _parse_sub(source, state, verbose)
                    if direction < 0:
                        if lookbehindgroups is None:
                            state.lookbehindgroups = None
                    if not sourcematch(")"):
                        raise source.error("missing ), unterminated subpattern",
                                           source.tell() - start)
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif char == "(":
                    # conditional backreference group
                    condname = source.getuntil(")")
                    group = None
                    if condname.isidentifier():
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name %r" % condname
                            raise source.error(msg, len(condname) + 1)
                    else:
                        try:
                            condgroup = int(condname)
                            if condgroup < 0:
                                raise ValueError
                        except ValueError:
                            msg = "bad character in group name %r" % condname
                            raise source.error(msg, len(condname) + 1) from None
                        if not condgroup:
                            raise source.error("bad group number",
                                               len(condname) + 1)
                        if condgroup >= MAXGROUPS:
                            msg = "invalid group reference %d" % condgroup
                            raise source.error(msg, len(condname) + 1)
                    state.checklookbehindgroup(condgroup, source)
                elif char in FLAGS or char == "-":
                    # flags
                    pos = source.pos
                    flags = _parse_flags(source, state, char)
                    if flags is None:  # global flags
                        if pos != 3:  # "(?x"
                            import warnings
                            warnings.warn(
                                'Flags not at the start of the expression %s%s' % (
                                    source.string[:20],  # truncate long regexes
                                    ' (truncated)' if len(source.string) > 20 else '',
                                ),
                                DeprecationWarning, stacklevel=7
                            )
                        continue
                    add_flags, del_flags = flags
                    group = None
                else:
                    raise source.error("unknown extension ?" + char,
                                       len(char) + 1)

            # parse group contents
            if group is not None:
                try:
                    group = state.opengroup(name)
                except error as err:
                    # an unnamed group has no name to measure; point at
                    # the opening parenthesis that was just consumed
                    offset = 1 if name is None else len(name) + 1
                    raise source.error(err.msg, offset) from None
            if condgroup:
                p = _parse_sub_cond(source, state, condgroup, verbose)
            else:
                sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
                               not (del_flags & SRE_FLAG_VERBOSE))
                p = _parse_sub(source, state, sub_verbose)
            if not sourcematch(")"):
                raise source.error("missing ), unterminated subpattern",
                                   source.tell() - start)
            if group is not None:
                state.closegroup(group, p)
            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        else:
            raise AssertionError("unsupported special character %r" % (this,))

    return subpattern
781
def _parse_flags(source, state, char):
    """Parse the flag letters of an inline "(?flags)" or "(?flags-flags:" group.

    *char* is the first token after "(?": a flag letter or "-".

    For the global form, ended by ")", the flags are merged into
    state.flags and None is returned; Verbose is raised instead when the
    group turns on the VERBOSE flag and state.flags does not have it yet.
    For the scoped form, ended by ":", an (add_flags, del_flags) pair is
    returned.  Malformed flag lists raise source.error().
    """
    read = source.get
    turn_on = 0
    turn_off = 0
    if char != "-":
        turn_on = FLAGS[char]
        while True:
            char = read()
            if char is None:
                raise source.error("missing -, : or )")
            if char in ")-:":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
                raise source.error(msg, len(char))
            turn_on |= FLAGS[char]
    if char == ")":
        # global flags
        enables_verbose = turn_on & SRE_FLAG_VERBOSE
        if enables_verbose and not (state.flags & SRE_FLAG_VERBOSE):
            raise Verbose
        state.flags |= turn_on
        return None
    if turn_on & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn on global flag", 1)
    if char == "-":
        # at least one flag letter must follow the "-"
        char = read()
        if char is None:
            raise source.error("missing flag")
        if char not in FLAGS:
            msg = "unknown flag" if char.isalpha() else "missing flag"
            raise source.error(msg, len(char))
        turn_off = FLAGS[char]
        while True:
            char = read()
            if char is None:
                raise source.error("missing :")
            if char == ":":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing :"
                raise source.error(msg, len(char))
            turn_off |= FLAGS[char]
    assert char == ":"
    if turn_off & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn off global flag", 1)
    if turn_on & turn_off:
        raise source.error("bad inline flags: flag turned on and off", 1)
    return turn_on, turn_off
828
def fix_flags(src, flags):
    """Check and fix flags according to the type of pattern (str or bytes).

    For a str pattern the UNICODE flag is added unless ASCII was requested.
    For any other pattern type the flags are returned unchanged once they
    have been validated.

    Raises ValueError for combinations that are not allowed: LOCALE with a
    str pattern, ASCII together with UNICODE, UNICODE with a bytes pattern,
    and ASCII together with LOCALE.
    """
    if not isinstance(src, str):
        # bytes pattern: nothing to add, only incompatible flags to reject
        if flags & SRE_FLAG_UNICODE:
            raise ValueError("cannot use UNICODE flag with a bytes pattern")
        if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
            raise ValueError("ASCII and LOCALE flags are incompatible")
        return flags

    # str pattern
    if flags & SRE_FLAG_LOCALE:
        raise ValueError("cannot use LOCALE flag with a str pattern")
    if flags & SRE_FLAG_ASCII:
        if flags & SRE_FLAG_UNICODE:
            raise ValueError("ASCII and UNICODE flags are incompatible")
        return flags
    # UNICODE is the default for str patterns
    return flags | SRE_FLAG_UNICODE
844
def parse(str, flags=0, pattern=None):
    """Parse an 're' pattern into a list of (opcode, argument) tuples.

    ``str`` is the pattern text, ``flags`` the initial flag bits and
    ``pattern`` an optional state object (a fresh ``Pattern()`` is created
    when it is None).  Returns the object produced by ``_parse_sub`` after
    its ``pattern.flags`` have been normalised by ``fix_flags``.

    Raises ``source.error`` ("unbalanced parenthesis") when input remains
    after parsing.  When SRE_FLAG_DEBUG is set the result is dumped before
    being returned.
    """
    tokens = Tokenizer(str)

    state = Pattern() if pattern is None else pattern
    state.flags = flags
    state.str = str

    try:
        parsed = _parse_sub(tokens, state, flags & SRE_FLAG_VERBOSE, False)
    except Verbose:
        # VERBOSE was switched on from inside the pattern: start over with
        # a fresh state and the flag set from the very beginning.
        state = Pattern()
        state.flags = flags | SRE_FLAG_VERBOSE
        state.str = str
        tokens.seek(0)
        parsed = _parse_sub(tokens, state, True, False)

    final_state = parsed.pattern
    final_state.flags = fix_flags(str, final_state.flags)

    # Anything left over at this point must be a stray closing parenthesis.
    if tokens.next is not None:
        assert tokens.next == ")"
        raise tokens.error("unbalanced parenthesis")

    if flags & SRE_FLAG_DEBUG:
        parsed.dump()

    return parsed
876
def parse_template(source, pattern):
    """Parse an 're' replacement string into literals and group references.

    ``source`` is the replacement template; ``pattern`` supplies ``groups``
    (the highest valid group number) and ``groupindex`` (group name ->
    group number).

    Returns ``(groups, literals)``.  ``literals`` is a list of text chunks
    in which each group reference is held by a ``None`` placeholder, and
    ``groups`` is a list of ``(position_in_literals, group_number)`` pairs
    identifying those placeholders.  When ``source`` is not a str the text
    chunks are encoded back to bytes as latin-1.

    Raises the tokenizer's error for malformed or out-of-range references
    and IndexError for an unknown group name.
    """
    tokens = Tokenizer(source)
    read = tokens.get
    refs = []
    chunks = []
    pending = []          # characters of the literal chunk being built
    emit = pending.append

    def reference(number, pos):
        # Record a reference to group `number`; `pos` is passed on to the
        # error constructor when the reference is out of range.
        if number > pattern.groups:
            raise tokens.error("invalid group reference %d" % number, pos)
        if pending:
            # flush the literal text collected so far
            chunks.append(''.join(pending))
            pending.clear()
        refs.append((len(chunks), number))
        chunks.append(None)

    named = pattern.groupindex

    # read() returns None at the end of the replacement string
    for this in iter(read, None):
        if this[0] != "\\":
            emit(this)
            continue

        c = this[1]
        if c == "g":
            # \g<name> or \g<number>
            if not tokens.match("<"):
                raise tokens.error("missing <")
            name = tokens.getuntil(">")
            width = len(name) + 1
            if name.isidentifier():
                try:
                    number = named[name]
                except KeyError:
                    raise IndexError("unknown group name %r" % name)
            else:
                try:
                    number = int(name)
                    if number < 0:
                        raise ValueError
                except ValueError:
                    raise tokens.error(
                        "bad character in group name %r" % name,
                        width) from None
                if number >= MAXGROUPS:
                    raise tokens.error(
                        "invalid group reference %d" % number, width)
            reference(number, width)
        elif c == "0":
            # octal escape: \0 followed by up to two more octal digits
            if tokens.next in OCTDIGITS:
                this += read()
                if tokens.next in OCTDIGITS:
                    this += read()
            emit(chr(int(this[1:], 8) & 0xff))
        elif c in DIGITS:
            if tokens.next in DIGITS:
                this += read()
                if (c in OCTDIGITS and this[2] in OCTDIGITS
                        and tokens.next in OCTDIGITS):
                    # three octal digits form an octal escape, not a group
                    this += read()
                    code = int(this[1:], 8)
                    if code > 0o377:
                        raise tokens.error('octal escape value %s outside of '
                                           'range 0-0o377' % this, len(this))
                    emit(chr(code))
                    continue
            # one or two decimal digits: numeric group reference
            reference(int(this[1:]), len(this) - 1)
        else:
            known = ESCAPES.get(this)
            if known is not None:
                this = chr(known[1])
            elif c in ASCIILETTERS:
                # The warning is issued from this frame on purpose so that
                # the stacklevel keeps pointing at the same caller.
                import warnings
                warnings.warn('bad escape %s' % this,
                              DeprecationWarning, stacklevel=4)
            emit(this)

    if pending:
        chunks.append(''.join(pending))
    if not isinstance(source, str):
        # The tokenizer implicitly decodes bytes objects as latin-1, so the
        # final representation has to be re-encoded.
        chunks = [chunk if chunk is None else chunk.encode('latin-1')
                  for chunk in chunks]
    return refs, chunks
963
def expand_template(template, match):
    """Fill a parsed replacement template with the groups of ``match``.

    ``template`` is a ``(groups, literals)`` pair: ``literals`` is a list of
    text chunks and ``groups`` a list of ``(position, group)`` pairs telling
    which slot of ``literals`` receives which group of ``match``.  The
    template itself is not modified; a copy of ``literals`` is filled in.

    Groups that yield a false value (e.g. None) are replaced by an empty
    string of the same type as ``match.string``.

    Returns the chunks joined into one string.  Raises ``error`` when an
    IndexError occurs while filling the template, naming the offending
    group reference.
    """
    g = match.group
    # empty str or bytes, matching the type of the subject string
    empty = match.string[:0]
    groups, literals = template
    literals = literals[:]
    try:
        for index, group in groups:
            literals[index] = g(group) or empty
    except IndexError:
        # Report the group that was asked for, not the slot in `literals`;
        # %r keeps the message well-formed for non-integer references too.
        raise error("invalid group reference %r" % (group,)) from None
    return empty.join(literals)
975