• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #
2 # Secret Labs' Regular Expression Engine
3 #
4 # convert re-style regular expression to sre pattern
5 #
6 # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
7 #
8 # See the sre.py file for information on usage and redistribution.
9 #
10 
11 """Internal support module for sre"""
12 
13 # XXX: show string offset and offending character for all errors
14 
15 from sre_constants import *
16 
17 SPECIAL_CHARS = ".\\[{()*+?^$|"
18 REPEAT_CHARS = "*+?{"
19 
20 DIGITS = frozenset("0123456789")
21 
22 OCTDIGITS = frozenset("01234567")
23 HEXDIGITS = frozenset("0123456789abcdefABCDEF")
24 ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
25 
26 WHITESPACE = frozenset(" \t\n\r\v\f")
27 
28 _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
29 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
30 
31 ESCAPES = {
32     r"\a": (LITERAL, ord("\a")),
33     r"\b": (LITERAL, ord("\b")),
34     r"\f": (LITERAL, ord("\f")),
35     r"\n": (LITERAL, ord("\n")),
36     r"\r": (LITERAL, ord("\r")),
37     r"\t": (LITERAL, ord("\t")),
38     r"\v": (LITERAL, ord("\v")),
39     r"\\": (LITERAL, ord("\\"))
40 }
41 
42 CATEGORIES = {
43     r"\A": (AT, AT_BEGINNING_STRING), # start of string
44     r"\b": (AT, AT_BOUNDARY),
45     r"\B": (AT, AT_NON_BOUNDARY),
46     r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
47     r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
48     r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
49     r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
50     r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
51     r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
52     r"\Z": (AT, AT_END_STRING), # end of string
53 }
54 
55 FLAGS = {
56     # standard flags
57     "i": SRE_FLAG_IGNORECASE,
58     "L": SRE_FLAG_LOCALE,
59     "m": SRE_FLAG_MULTILINE,
60     "s": SRE_FLAG_DOTALL,
61     "x": SRE_FLAG_VERBOSE,
62     # extensions
63     "a": SRE_FLAG_ASCII,
64     "t": SRE_FLAG_TEMPLATE,
65     "u": SRE_FLAG_UNICODE,
66 }
67 
68 TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
69 GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
70 
71 class Verbose(Exception):
72     pass
73 
74 class State:
75     # keeps track of state for parsing
76     def __init__(self):
77         self.flags = 0
78         self.groupdict = {}
79         self.groupwidths = [None]  # group 0
80         self.lookbehindgroups = None
81     @property
82     def groups(self):
83         return len(self.groupwidths)
84     def opengroup(self, name=None):
85         gid = self.groups
86         self.groupwidths.append(None)
87         if self.groups > MAXGROUPS:
88             raise error("too many groups")
89         if name is not None:
90             ogid = self.groupdict.get(name, None)
91             if ogid is not None:
92                 raise error("redefinition of group name %r as group %d; "
93                             "was group %d" % (name, gid,  ogid))
94             self.groupdict[name] = gid
95         return gid
96     def closegroup(self, gid, p):
97         self.groupwidths[gid] = p.getwidth()
98     def checkgroup(self, gid):
99         return gid < self.groups and self.groupwidths[gid] is not None
100 
101     def checklookbehindgroup(self, gid, source):
102         if self.lookbehindgroups is not None:
103             if not self.checkgroup(gid):
104                 raise source.error('cannot refer to an open group')
105             if gid >= self.lookbehindgroups:
106                 raise source.error('cannot refer to group defined in the same '
107                                    'lookbehind subpattern')
108 
109 class SubPattern:
110     # a subpattern, in intermediate form
111     def __init__(self, state, data=None):
112         self.state = state
113         if data is None:
114             data = []
115         self.data = data
116         self.width = None
117 
118     def dump(self, level=0):
119         nl = True
120         seqtypes = (tuple, list)
121         for op, av in self.data:
122             print(level*"  " + str(op), end='')
123             if op is IN:
124                 # member sublanguage
125                 print()
126                 for op, a in av:
127                     print((level+1)*"  " + str(op), a)
128             elif op is BRANCH:
129                 print()
130                 for i, a in enumerate(av[1]):
131                     if i:
132                         print(level*"  " + "OR")
133                     a.dump(level+1)
134             elif op is GROUPREF_EXISTS:
135                 condgroup, item_yes, item_no = av
136                 print('', condgroup)
137                 item_yes.dump(level+1)
138                 if item_no:
139                     print(level*"  " + "ELSE")
140                     item_no.dump(level+1)
141             elif isinstance(av, seqtypes):
142                 nl = False
143                 for a in av:
144                     if isinstance(a, SubPattern):
145                         if not nl:
146                             print()
147                         a.dump(level+1)
148                         nl = True
149                     else:
150                         if not nl:
151                             print(' ', end='')
152                         print(a, end='')
153                         nl = False
154                 if not nl:
155                     print()
156             else:
157                 print('', av)
158     def __repr__(self):
159         return repr(self.data)
160     def __len__(self):
161         return len(self.data)
162     def __delitem__(self, index):
163         del self.data[index]
164     def __getitem__(self, index):
165         if isinstance(index, slice):
166             return SubPattern(self.state, self.data[index])
167         return self.data[index]
168     def __setitem__(self, index, code):
169         self.data[index] = code
170     def insert(self, index, code):
171         self.data.insert(index, code)
172     def append(self, code):
173         self.data.append(code)
174     def getwidth(self):
175         # determine the width (min, max) for this subpattern
176         if self.width is not None:
177             return self.width
178         lo = hi = 0
179         for op, av in self.data:
180             if op is BRANCH:
181                 i = MAXREPEAT - 1
182                 j = 0
183                 for av in av[1]:
184                     l, h = av.getwidth()
185                     i = min(i, l)
186                     j = max(j, h)
187                 lo = lo + i
188                 hi = hi + j
189             elif op is CALL:
190                 i, j = av.getwidth()
191                 lo = lo + i
192                 hi = hi + j
193             elif op is SUBPATTERN:
194                 i, j = av[-1].getwidth()
195                 lo = lo + i
196                 hi = hi + j
197             elif op in _REPEATCODES:
198                 i, j = av[2].getwidth()
199                 lo = lo + i * av[0]
200                 hi = hi + j * av[1]
201             elif op in _UNITCODES:
202                 lo = lo + 1
203                 hi = hi + 1
204             elif op is GROUPREF:
205                 i, j = self.state.groupwidths[av]
206                 lo = lo + i
207                 hi = hi + j
208             elif op is GROUPREF_EXISTS:
209                 i, j = av[1].getwidth()
210                 if av[2] is not None:
211                     l, h = av[2].getwidth()
212                     i = min(i, l)
213                     j = max(j, h)
214                 else:
215                     i = 0
216                 lo = lo + i
217                 hi = hi + j
218             elif op is SUCCESS:
219                 break
220         self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
221         return self.width
222 
223 class Tokenizer:
224     def __init__(self, string):
225         self.istext = isinstance(string, str)
226         self.string = string
227         if not self.istext:
228             string = str(string, 'latin1')
229         self.decoded_string = string
230         self.index = 0
231         self.next = None
232         self.__next()
233     def __next(self):
234         index = self.index
235         try:
236             char = self.decoded_string[index]
237         except IndexError:
238             self.next = None
239             return
240         if char == "\\":
241             index += 1
242             try:
243                 char += self.decoded_string[index]
244             except IndexError:
245                 raise error("bad escape (end of pattern)",
246                             self.string, len(self.string) - 1) from None
247         self.index = index + 1
248         self.next = char
249     def match(self, char):
250         if char == self.next:
251             self.__next()
252             return True
253         return False
254     def get(self):
255         this = self.next
256         self.__next()
257         return this
258     def getwhile(self, n, charset):
259         result = ''
260         for _ in range(n):
261             c = self.next
262             if c not in charset:
263                 break
264             result += c
265             self.__next()
266         return result
267     def getuntil(self, terminator, name):
268         result = ''
269         while True:
270             c = self.next
271             self.__next()
272             if c is None:
273                 if not result:
274                     raise self.error("missing " + name)
275                 raise self.error("missing %s, unterminated name" % terminator,
276                                  len(result))
277             if c == terminator:
278                 if not result:
279                     raise self.error("missing " + name, 1)
280                 break
281             result += c
282         return result
283     @property
284     def pos(self):
285         return self.index - len(self.next or '')
286     def tell(self):
287         return self.index - len(self.next or '')
288     def seek(self, index):
289         self.index = index
290         self.__next()
291 
292     def error(self, msg, offset=0):
293         return error(msg, self.string, self.tell() - offset)
294 
295 def _class_escape(source, escape):
296     # handle escape code inside character class
297     code = ESCAPES.get(escape)
298     if code:
299         return code
300     code = CATEGORIES.get(escape)
301     if code and code[0] is IN:
302         return code
303     try:
304         c = escape[1:2]
305         if c == "x":
306             # hexadecimal escape (exactly two digits)
307             escape += source.getwhile(2, HEXDIGITS)
308             if len(escape) != 4:
309                 raise source.error("incomplete escape %s" % escape, len(escape))
310             return LITERAL, int(escape[2:], 16)
311         elif c == "u" and source.istext:
312             # unicode escape (exactly four digits)
313             escape += source.getwhile(4, HEXDIGITS)
314             if len(escape) != 6:
315                 raise source.error("incomplete escape %s" % escape, len(escape))
316             return LITERAL, int(escape[2:], 16)
317         elif c == "U" and source.istext:
318             # unicode escape (exactly eight digits)
319             escape += source.getwhile(8, HEXDIGITS)
320             if len(escape) != 10:
321                 raise source.error("incomplete escape %s" % escape, len(escape))
322             c = int(escape[2:], 16)
323             chr(c) # raise ValueError for invalid code
324             return LITERAL, c
325         elif c == "N" and source.istext:
326             import unicodedata
327             # named unicode escape e.g. \N{EM DASH}
328             if not source.match('{'):
329                 raise source.error("missing {")
330             charname = source.getuntil('}', 'character name')
331             try:
332                 c = ord(unicodedata.lookup(charname))
333             except KeyError:
334                 raise source.error("undefined character name %r" % charname,
335                                    len(charname) + len(r'\N{}'))
336             return LITERAL, c
337         elif c in OCTDIGITS:
338             # octal escape (up to three digits)
339             escape += source.getwhile(2, OCTDIGITS)
340             c = int(escape[1:], 8)
341             if c > 0o377:
342                 raise source.error('octal escape value %s outside of '
343                                    'range 0-0o377' % escape, len(escape))
344             return LITERAL, c
345         elif c in DIGITS:
346             raise ValueError
347         if len(escape) == 2:
348             if c in ASCIILETTERS:
349                 raise source.error('bad escape %s' % escape, len(escape))
350             return LITERAL, ord(escape[1])
351     except ValueError:
352         pass
353     raise source.error("bad escape %s" % escape, len(escape))
354 
355 def _escape(source, escape, state):
356     # handle escape code in expression
357     code = CATEGORIES.get(escape)
358     if code:
359         return code
360     code = ESCAPES.get(escape)
361     if code:
362         return code
363     try:
364         c = escape[1:2]
365         if c == "x":
366             # hexadecimal escape
367             escape += source.getwhile(2, HEXDIGITS)
368             if len(escape) != 4:
369                 raise source.error("incomplete escape %s" % escape, len(escape))
370             return LITERAL, int(escape[2:], 16)
371         elif c == "u" and source.istext:
372             # unicode escape (exactly four digits)
373             escape += source.getwhile(4, HEXDIGITS)
374             if len(escape) != 6:
375                 raise source.error("incomplete escape %s" % escape, len(escape))
376             return LITERAL, int(escape[2:], 16)
377         elif c == "U" and source.istext:
378             # unicode escape (exactly eight digits)
379             escape += source.getwhile(8, HEXDIGITS)
380             if len(escape) != 10:
381                 raise source.error("incomplete escape %s" % escape, len(escape))
382             c = int(escape[2:], 16)
383             chr(c) # raise ValueError for invalid code
384             return LITERAL, c
385         elif c == "N" and source.istext:
386             import unicodedata
387             # named unicode escape e.g. \N{EM DASH}
388             if not source.match('{'):
389                 raise source.error("missing {")
390             charname = source.getuntil('}', 'character name')
391             try:
392                 c = ord(unicodedata.lookup(charname))
393             except KeyError:
394                 raise source.error("undefined character name %r" % charname,
395                                    len(charname) + len(r'\N{}'))
396             return LITERAL, c
397         elif c == "0":
398             # octal escape
399             escape += source.getwhile(2, OCTDIGITS)
400             return LITERAL, int(escape[1:], 8)
401         elif c in DIGITS:
402             # octal escape *or* decimal group reference (sigh)
403             if source.next in DIGITS:
404                 escape += source.get()
405                 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
406                     source.next in OCTDIGITS):
407                     # got three octal digits; this is an octal escape
408                     escape += source.get()
409                     c = int(escape[1:], 8)
410                     if c > 0o377:
411                         raise source.error('octal escape value %s outside of '
412                                            'range 0-0o377' % escape,
413                                            len(escape))
414                     return LITERAL, c
415             # not an octal escape, so this is a group reference
416             group = int(escape[1:])
417             if group < state.groups:
418                 if not state.checkgroup(group):
419                     raise source.error("cannot refer to an open group",
420                                        len(escape))
421                 state.checklookbehindgroup(group, source)
422                 return GROUPREF, group
423             raise source.error("invalid group reference %d" % group, len(escape) - 1)
424         if len(escape) == 2:
425             if c in ASCIILETTERS:
426                 raise source.error("bad escape %s" % escape, len(escape))
427             return LITERAL, ord(escape[1])
428     except ValueError:
429         pass
430     raise source.error("bad escape %s" % escape, len(escape))
431 
432 def _uniq(items):
433     return list(dict.fromkeys(items))
434 
435 def _parse_sub(source, state, verbose, nested):
436     # parse an alternation: a|b|c
437 
438     items = []
439     itemsappend = items.append
440     sourcematch = source.match
441     start = source.tell()
442     while True:
443         itemsappend(_parse(source, state, verbose, nested + 1,
444                            not nested and not items))
445         if not sourcematch("|"):
446             break
447 
448     if len(items) == 1:
449         return items[0]
450 
451     subpattern = SubPattern(state)
452 
453     # check if all items share a common prefix
454     while True:
455         prefix = None
456         for item in items:
457             if not item:
458                 break
459             if prefix is None:
460                 prefix = item[0]
461             elif item[0] != prefix:
462                 break
463         else:
464             # all subitems start with a common "prefix".
465             # move it out of the branch
466             for item in items:
467                 del item[0]
468             subpattern.append(prefix)
469             continue # check next one
470         break
471 
472     # check if the branch can be replaced by a character set
473     set = []
474     for item in items:
475         if len(item) != 1:
476             break
477         op, av = item[0]
478         if op is LITERAL:
479             set.append((op, av))
480         elif op is IN and av[0][0] is not NEGATE:
481             set.extend(av)
482         else:
483             break
484     else:
485         # we can store this as a character set instead of a
486         # branch (the compiler may optimize this even more)
487         subpattern.append((IN, _uniq(set)))
488         return subpattern
489 
490     subpattern.append((BRANCH, (None, items)))
491     return subpattern
492 
493 def _parse(source, state, verbose, nested, first=False):
494     # parse a simple pattern
495     subpattern = SubPattern(state)
496 
497     # precompute constants into local variables
498     subpatternappend = subpattern.append
499     sourceget = source.get
500     sourcematch = source.match
501     _len = len
502     _ord = ord
503 
504     while True:
505 
506         this = source.next
507         if this is None:
508             break # end of pattern
509         if this in "|)":
510             break # end of subpattern
511         sourceget()
512 
513         if verbose:
514             # skip whitespace and comments
515             if this in WHITESPACE:
516                 continue
517             if this == "#":
518                 while True:
519                     this = sourceget()
520                     if this is None or this == "\n":
521                         break
522                 continue
523 
524         if this[0] == "\\":
525             code = _escape(source, this, state)
526             subpatternappend(code)
527 
528         elif this not in SPECIAL_CHARS:
529             subpatternappend((LITERAL, _ord(this)))
530 
531         elif this == "[":
532             here = source.tell() - 1
533             # character set
534             set = []
535             setappend = set.append
536 ##          if sourcematch(":"):
537 ##              pass # handle character classes
538             if source.next == '[':
539                 import warnings
540                 warnings.warn(
541                     'Possible nested set at position %d' % source.tell(),
542                     FutureWarning, stacklevel=nested + 6
543                 )
544             negate = sourcematch("^")
545             # check remaining characters
546             while True:
547                 this = sourceget()
548                 if this is None:
549                     raise source.error("unterminated character set",
550                                        source.tell() - here)
551                 if this == "]" and set:
552                     break
553                 elif this[0] == "\\":
554                     code1 = _class_escape(source, this)
555                 else:
556                     if set and this in '-&~|' and source.next == this:
557                         import warnings
558                         warnings.warn(
559                             'Possible set %s at position %d' % (
560                                 'difference' if this == '-' else
561                                 'intersection' if this == '&' else
562                                 'symmetric difference' if this == '~' else
563                                 'union',
564                                 source.tell() - 1),
565                             FutureWarning, stacklevel=nested + 6
566                         )
567                     code1 = LITERAL, _ord(this)
568                 if sourcematch("-"):
569                     # potential range
570                     that = sourceget()
571                     if that is None:
572                         raise source.error("unterminated character set",
573                                            source.tell() - here)
574                     if that == "]":
575                         if code1[0] is IN:
576                             code1 = code1[1][0]
577                         setappend(code1)
578                         setappend((LITERAL, _ord("-")))
579                         break
580                     if that[0] == "\\":
581                         code2 = _class_escape(source, that)
582                     else:
583                         if that == '-':
584                             import warnings
585                             warnings.warn(
586                                 'Possible set difference at position %d' % (
587                                     source.tell() - 2),
588                                 FutureWarning, stacklevel=nested + 6
589                             )
590                         code2 = LITERAL, _ord(that)
591                     if code1[0] != LITERAL or code2[0] != LITERAL:
592                         msg = "bad character range %s-%s" % (this, that)
593                         raise source.error(msg, len(this) + 1 + len(that))
594                     lo = code1[1]
595                     hi = code2[1]
596                     if hi < lo:
597                         msg = "bad character range %s-%s" % (this, that)
598                         raise source.error(msg, len(this) + 1 + len(that))
599                     setappend((RANGE, (lo, hi)))
600                 else:
601                     if code1[0] is IN:
602                         code1 = code1[1][0]
603                     setappend(code1)
604 
605             set = _uniq(set)
606             # XXX: <fl> should move set optimization to compiler!
607             if _len(set) == 1 and set[0][0] is LITERAL:
608                 # optimization
609                 if negate:
610                     subpatternappend((NOT_LITERAL, set[0][1]))
611                 else:
612                     subpatternappend(set[0])
613             else:
614                 if negate:
615                     set.insert(0, (NEGATE, None))
616                 # charmap optimization can't be added here because
617                 # global flags still are not known
618                 subpatternappend((IN, set))
619 
620         elif this in REPEAT_CHARS:
621             # repeat previous item
622             here = source.tell()
623             if this == "?":
624                 min, max = 0, 1
625             elif this == "*":
626                 min, max = 0, MAXREPEAT
627 
628             elif this == "+":
629                 min, max = 1, MAXREPEAT
630             elif this == "{":
631                 if source.next == "}":
632                     subpatternappend((LITERAL, _ord(this)))
633                     continue
634 
635                 min, max = 0, MAXREPEAT
636                 lo = hi = ""
637                 while source.next in DIGITS:
638                     lo += sourceget()
639                 if sourcematch(","):
640                     while source.next in DIGITS:
641                         hi += sourceget()
642                 else:
643                     hi = lo
644                 if not sourcematch("}"):
645                     subpatternappend((LITERAL, _ord(this)))
646                     source.seek(here)
647                     continue
648 
649                 if lo:
650                     min = int(lo)
651                     if min >= MAXREPEAT:
652                         raise OverflowError("the repetition number is too large")
653                 if hi:
654                     max = int(hi)
655                     if max >= MAXREPEAT:
656                         raise OverflowError("the repetition number is too large")
657                     if max < min:
658                         raise source.error("min repeat greater than max repeat",
659                                            source.tell() - here)
660             else:
661                 raise AssertionError("unsupported quantifier %r" % (char,))
662             # figure out which item to repeat
663             if subpattern:
664                 item = subpattern[-1:]
665             else:
666                 item = None
667             if not item or item[0][0] is AT:
668                 raise source.error("nothing to repeat",
669                                    source.tell() - here + len(this))
670             if item[0][0] in _REPEATCODES:
671                 raise source.error("multiple repeat",
672                                    source.tell() - here + len(this))
673             if item[0][0] is SUBPATTERN:
674                 group, add_flags, del_flags, p = item[0][1]
675                 if group is None and not add_flags and not del_flags:
676                     item = p
677             if sourcematch("?"):
678                 subpattern[-1] = (MIN_REPEAT, (min, max, item))
679             else:
680                 subpattern[-1] = (MAX_REPEAT, (min, max, item))
681 
682         elif this == ".":
683             subpatternappend((ANY, None))
684 
685         elif this == "(":
686             start = source.tell() - 1
687             group = True
688             name = None
689             add_flags = 0
690             del_flags = 0
691             if sourcematch("?"):
692                 # options
693                 char = sourceget()
694                 if char is None:
695                     raise source.error("unexpected end of pattern")
696                 if char == "P":
697                     # python extensions
698                     if sourcematch("<"):
699                         # named group: skip forward to end of name
700                         name = source.getuntil(">", "group name")
701                         if not name.isidentifier():
702                             msg = "bad character in group name %r" % name
703                             raise source.error(msg, len(name) + 1)
704                     elif sourcematch("="):
705                         # named backreference
706                         name = source.getuntil(")", "group name")
707                         if not name.isidentifier():
708                             msg = "bad character in group name %r" % name
709                             raise source.error(msg, len(name) + 1)
710                         gid = state.groupdict.get(name)
711                         if gid is None:
712                             msg = "unknown group name %r" % name
713                             raise source.error(msg, len(name) + 1)
714                         if not state.checkgroup(gid):
715                             raise source.error("cannot refer to an open group",
716                                                len(name) + 1)
717                         state.checklookbehindgroup(gid, source)
718                         subpatternappend((GROUPREF, gid))
719                         continue
720 
721                     else:
722                         char = sourceget()
723                         if char is None:
724                             raise source.error("unexpected end of pattern")
725                         raise source.error("unknown extension ?P" + char,
726                                            len(char) + 2)
727                 elif char == ":":
728                     # non-capturing group
729                     group = None
730                 elif char == "#":
731                     # comment
732                     while True:
733                         if source.next is None:
734                             raise source.error("missing ), unterminated comment",
735                                                source.tell() - start)
736                         if sourceget() == ")":
737                             break
738                     continue
739 
740                 elif char in "=!<":
741                     # lookahead assertions
742                     dir = 1
743                     if char == "<":
744                         char = sourceget()
745                         if char is None:
746                             raise source.error("unexpected end of pattern")
747                         if char not in "=!":
748                             raise source.error("unknown extension ?<" + char,
749                                                len(char) + 2)
750                         dir = -1 # lookbehind
751                         lookbehindgroups = state.lookbehindgroups
752                         if lookbehindgroups is None:
753                             state.lookbehindgroups = state.groups
754                     p = _parse_sub(source, state, verbose, nested + 1)
755                     if dir < 0:
756                         if lookbehindgroups is None:
757                             state.lookbehindgroups = None
758                     if not sourcematch(")"):
759                         raise source.error("missing ), unterminated subpattern",
760                                            source.tell() - start)
761                     if char == "=":
762                         subpatternappend((ASSERT, (dir, p)))
763                     else:
764                         subpatternappend((ASSERT_NOT, (dir, p)))
765                     continue
766 
767                 elif char == "(":
768                     # conditional backreference group
769                     condname = source.getuntil(")", "group name")
770                     if condname.isidentifier():
771                         condgroup = state.groupdict.get(condname)
772                         if condgroup is None:
773                             msg = "unknown group name %r" % condname
774                             raise source.error(msg, len(condname) + 1)
775                     else:
776                         try:
777                             condgroup = int(condname)
778                             if condgroup < 0:
779                                 raise ValueError
780                         except ValueError:
781                             msg = "bad character in group name %r" % condname
782                             raise source.error(msg, len(condname) + 1) from None
783                         if not condgroup:
784                             raise source.error("bad group number",
785                                                len(condname) + 1)
786                         if condgroup >= MAXGROUPS:
787                             msg = "invalid group reference %d" % condgroup
788                             raise source.error(msg, len(condname) + 1)
789                     state.checklookbehindgroup(condgroup, source)
790                     item_yes = _parse(source, state, verbose, nested + 1)
791                     if source.match("|"):
792                         item_no = _parse(source, state, verbose, nested + 1)
793                         if source.next == "|":
794                             raise source.error("conditional backref with more than two branches")
795                     else:
796                         item_no = None
797                     if not source.match(")"):
798                         raise source.error("missing ), unterminated subpattern",
799                                            source.tell() - start)
800                     subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
801                     continue
802 
803                 elif char in FLAGS or char == "-":
804                     # flags
805                     flags = _parse_flags(source, state, char)
806                     if flags is None:  # global flags
807                         if not first or subpattern:
808                             import warnings
809                             warnings.warn(
810                                 'Flags not at the start of the expression %r%s' % (
811                                     source.string[:20],  # truncate long regexes
812                                     ' (truncated)' if len(source.string) > 20 else '',
813                                 ),
814                                 DeprecationWarning, stacklevel=nested + 6
815                             )
816                         if (state.flags & SRE_FLAG_VERBOSE) and not verbose:
817                             raise Verbose
818                         continue
819 
820                     add_flags, del_flags = flags
821                     group = None
822                 else:
823                     raise source.error("unknown extension ?" + char,
824                                        len(char) + 1)
825 
826             # parse group contents
827             if group is not None:
828                 try:
829                     group = state.opengroup(name)
830                 except error as err:
831                     raise source.error(err.msg, len(name) + 1) from None
832             sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
833                            not (del_flags & SRE_FLAG_VERBOSE))
834             p = _parse_sub(source, state, sub_verbose, nested + 1)
835             if not source.match(")"):
836                 raise source.error("missing ), unterminated subpattern",
837                                    source.tell() - start)
838             if group is not None:
839                 state.closegroup(group, p)
840             subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
841 
842         elif this == "^":
843             subpatternappend((AT, AT_BEGINNING))
844 
845         elif this == "$":
846             subpatternappend((AT, AT_END))
847 
848         else:
849             raise AssertionError("unsupported special character %r" % (char,))
850 
851     # unpack non-capturing groups
852     for i in range(len(subpattern))[::-1]:
853         op, av = subpattern[i]
854         if op is SUBPATTERN:
855             group, add_flags, del_flags, p = av
856             if group is None and not add_flags and not del_flags:
857                 subpattern[i: i+1] = p
858 
859     return subpattern
860 
861 def _parse_flags(source, state, char):
862     sourceget = source.get
863     add_flags = 0
864     del_flags = 0
865     if char != "-":
866         while True:
867             flag = FLAGS[char]
868             if source.istext:
869                 if char == 'L':
870                     msg = "bad inline flags: cannot use 'L' flag with a str pattern"
871                     raise source.error(msg)
872             else:
873                 if char == 'u':
874                     msg = "bad inline flags: cannot use 'u' flag with a bytes pattern"
875                     raise source.error(msg)
876             add_flags |= flag
877             if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag:
878                 msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
879                 raise source.error(msg)
880             char = sourceget()
881             if char is None:
882                 raise source.error("missing -, : or )")
883             if char in ")-:":
884                 break
885             if char not in FLAGS:
886                 msg = "unknown flag" if char.isalpha() else "missing -, : or )"
887                 raise source.error(msg, len(char))
888     if char == ")":
889         state.flags |= add_flags
890         return None
891     if add_flags & GLOBAL_FLAGS:
892         raise source.error("bad inline flags: cannot turn on global flag", 1)
893     if char == "-":
894         char = sourceget()
895         if char is None:
896             raise source.error("missing flag")
897         if char not in FLAGS:
898             msg = "unknown flag" if char.isalpha() else "missing flag"
899             raise source.error(msg, len(char))
900         while True:
901             flag = FLAGS[char]
902             if flag & TYPE_FLAGS:
903                 msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'"
904                 raise source.error(msg)
905             del_flags |= flag
906             char = sourceget()
907             if char is None:
908                 raise source.error("missing :")
909             if char == ":":
910                 break
911             if char not in FLAGS:
912                 msg = "unknown flag" if char.isalpha() else "missing :"
913                 raise source.error(msg, len(char))
914     assert char == ":"
915     if del_flags & GLOBAL_FLAGS:
916         raise source.error("bad inline flags: cannot turn off global flag", 1)
917     if add_flags & del_flags:
918         raise source.error("bad inline flags: flag turned on and off", 1)
919     return add_flags, del_flags
920 
921 def fix_flags(src, flags):
922     # Check and fix flags according to the type of pattern (str or bytes)
923     if isinstance(src, str):
924         if flags & SRE_FLAG_LOCALE:
925             raise ValueError("cannot use LOCALE flag with a str pattern")
926         if not flags & SRE_FLAG_ASCII:
927             flags |= SRE_FLAG_UNICODE
928         elif flags & SRE_FLAG_UNICODE:
929             raise ValueError("ASCII and UNICODE flags are incompatible")
930     else:
931         if flags & SRE_FLAG_UNICODE:
932             raise ValueError("cannot use UNICODE flag with a bytes pattern")
933         if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
934             raise ValueError("ASCII and LOCALE flags are incompatible")
935     return flags
936 
937 def parse(str, flags=0, state=None):
938     # parse 're' pattern into list of (opcode, argument) tuples
939 
940     source = Tokenizer(str)
941 
942     if state is None:
943         state = State()
944     state.flags = flags
945     state.str = str
946 
947     try:
948         p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
949     except Verbose:
950         # the VERBOSE flag was switched on inside the pattern.  to be
951         # on the safe side, we'll parse the whole thing again...
952         state = State()
953         state.flags = flags | SRE_FLAG_VERBOSE
954         state.str = str
955         source.seek(0)
956         p = _parse_sub(source, state, True, 0)
957 
958     p.state.flags = fix_flags(str, p.state.flags)
959 
960     if source.next is not None:
961         assert source.next == ")"
962         raise source.error("unbalanced parenthesis")
963 
964     if flags & SRE_FLAG_DEBUG:
965         p.dump()
966 
967     return p
968 
969 def parse_template(source, state):
970     # parse 're' replacement string into list of literals and
971     # group references
972     s = Tokenizer(source)
973     sget = s.get
974     groups = []
975     literals = []
976     literal = []
977     lappend = literal.append
978     def addgroup(index, pos):
979         if index > state.groups:
980             raise s.error("invalid group reference %d" % index, pos)
981         if literal:
982             literals.append(''.join(literal))
983             del literal[:]
984         groups.append((len(literals), index))
985         literals.append(None)
986     groupindex = state.groupindex
987     while True:
988         this = sget()
989         if this is None:
990             break # end of replacement string
991         if this[0] == "\\":
992             # group
993             c = this[1]
994             if c == "g":
995                 name = ""
996                 if not s.match("<"):
997                     raise s.error("missing <")
998                 name = s.getuntil(">", "group name")
999                 if name.isidentifier():
1000                     try:
1001                         index = groupindex[name]
1002                     except KeyError:
1003                         raise IndexError("unknown group name %r" % name)
1004                 else:
1005                     try:
1006                         index = int(name)
1007                         if index < 0:
1008                             raise ValueError
1009                     except ValueError:
1010                         raise s.error("bad character in group name %r" % name,
1011                                       len(name) + 1) from None
1012                     if index >= MAXGROUPS:
1013                         raise s.error("invalid group reference %d" % index,
1014                                       len(name) + 1)
1015                 addgroup(index, len(name) + 1)
1016             elif c == "0":
1017                 if s.next in OCTDIGITS:
1018                     this += sget()
1019                     if s.next in OCTDIGITS:
1020                         this += sget()
1021                 lappend(chr(int(this[1:], 8) & 0xff))
1022             elif c in DIGITS:
1023                 isoctal = False
1024                 if s.next in DIGITS:
1025                     this += sget()
1026                     if (c in OCTDIGITS and this[2] in OCTDIGITS and
1027                         s.next in OCTDIGITS):
1028                         this += sget()
1029                         isoctal = True
1030                         c = int(this[1:], 8)
1031                         if c > 0o377:
1032                             raise s.error('octal escape value %s outside of '
1033                                           'range 0-0o377' % this, len(this))
1034                         lappend(chr(c))
1035                 if not isoctal:
1036                     addgroup(int(this[1:]), len(this) - 1)
1037             else:
1038                 try:
1039                     this = chr(ESCAPES[this][1])
1040                 except KeyError:
1041                     if c in ASCIILETTERS:
1042                         raise s.error('bad escape %s' % this, len(this))
1043                 lappend(this)
1044         else:
1045             lappend(this)
1046     if literal:
1047         literals.append(''.join(literal))
1048     if not isinstance(source, str):
1049         # The tokenizer implicitly decodes bytes objects as latin-1, we must
1050         # therefore re-encode the final representation.
1051         literals = [None if s is None else s.encode('latin-1') for s in literals]
1052     return groups, literals
1053 
1054 def expand_template(template, match):
1055     g = match.group
1056     empty = match.string[:0]
1057     groups, literals = template
1058     literals = literals[:]
1059     try:
1060         for index, group in groups:
1061             literals[index] = g(group) or empty
1062     except IndexError:
1063         raise error("invalid group reference %d" % index)
1064     return empty.join(literals)
1065