• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from test.support import (gc_collect, bigmemtest, _2G,
2                          cpython_only, captured_stdout)
3import locale
4import re
5import sre_compile
6import string
7import unittest
8import warnings
9from re import Scanner
10from weakref import proxy
11
12# Misc tests from Tim Peters' re.doc
13
14# WARNING: Don't change details in these tests if you don't know
15# what you're doing. Some of these tests were carefully modeled to
16# cover most of the code.
17
class S(str):
    """str subclass whose item and slice lookups return S instances."""

    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return S(piece)
21
class B(bytes):
    """bytes subclass whose item and slice lookups are wrapped in B."""

    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return B(piece)
25
26class ReTests(unittest.TestCase):
27
28    def assertTypedEqual(self, actual, expect, msg=None):
29        self.assertEqual(actual, expect, msg)
30        def recurse(actual, expect):
31            if isinstance(expect, (tuple, list)):
32                for x, y in zip(actual, expect):
33                    recurse(x, y)
34            else:
35                self.assertIs(type(actual), type(expect), msg)
36        recurse(actual, expect)
37
38    def checkPatternError(self, pattern, errmsg, pos=None):
39        with self.assertRaises(re.error) as cm:
40            re.compile(pattern)
41        with self.subTest(pattern=pattern):
42            err = cm.exception
43            self.assertEqual(err.msg, errmsg)
44            if pos is not None:
45                self.assertEqual(err.pos, pos)
46
47    def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
48        with self.assertRaises(re.error) as cm:
49            re.sub(pattern, repl, string)
50        with self.subTest(pattern=pattern, repl=repl):
51            err = cm.exception
52            self.assertEqual(err.msg, errmsg)
53            if pos is not None:
54                self.assertEqual(err.pos, pos)
55
56    def test_keep_buffer(self):
57        # See bug 14212
58        b = bytearray(b'x')
59        it = re.finditer(b'a', b)
60        with self.assertRaises(BufferError):
61            b.extend(b'x'*400)
62        list(it)
63        del it
64        gc_collect()
65        b.extend(b'x'*400)
66
67    def test_weakref(self):
68        s = 'QabbbcR'
69        x = re.compile('ab+c')
70        y = proxy(x)
71        self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
72
73    def test_search_star_plus(self):
74        self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
75        self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
76        self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
77        self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
78        self.assertIsNone(re.search('x', 'aaa'))
79        self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
80        self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
81        self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
82        self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
83        self.assertIsNone(re.match('a+', 'xxx'))
84
85    def bump_num(self, matchobj):
86        int_value = int(matchobj.group(0))
87        return str(int_value + 1)
88
89    def test_basic_re_sub(self):
90        self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
91        self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
92        self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
93        self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
94        self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
95        self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
96        for y in ("\xe0", "\u0430", "\U0001d49c"):
97            self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')
98
99        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
100        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
101                         '9.3 -3 24x100y')
102        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
103                         '9.3 -3 23x99y')
104        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
105                         '9.3 -3 23x99y')
106
107        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
108        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
109
110        s = r"\1\1"
111        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
112        self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
113        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
114
115        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
116        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
117        self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
118        self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
119
120        self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
121        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
122        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
123                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
124        for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
125            with self.subTest(c):
126                with self.assertRaises(re.error):
127                    self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
128
129        self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
130
131    def test_bug_449964(self):
132        # fails for group followed by other escape
133        self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
134                         'xx\bxx\b')
135
136    def test_bug_449000(self):
137        # Test for sub() on escaped characters
138        self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
139                         'abc\ndef\n')
140        self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
141                         'abc\ndef\n')
142        self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
143                         'abc\ndef\n')
144        self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
145                         'abc\ndef\n')
146
147    def test_bug_1661(self):
148        # Verify that flags do not get silently ignored with compiled patterns
149        pattern = re.compile('.')
150        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
151        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
152        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
153        self.assertRaises(ValueError, re.compile, pattern, re.I)
154
155    def test_bug_3629(self):
156        # A regex that triggered a bug in the sre-code validator
157        re.compile("(?P<quote>)(?(quote))")
158
159    def test_sub_template_numeric_escape(self):
160        # bug 776311 and friends
161        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
162        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
163        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
164        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
165        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
166        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
167        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
168        self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
169
170        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
171        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
172
173        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
174        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
175        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
176        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
177        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
178
179        self.checkTemplateError('x', r'\400', 'x',
180                                r'octal escape value \400 outside of '
181                                r'range 0-0o377', 0)
182        self.checkTemplateError('x', r'\777', 'x',
183                                r'octal escape value \777 outside of '
184                                r'range 0-0o377', 0)
185
186        self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
187        self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
188        self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
189        self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
190        self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
191        self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
192        self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
193        self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
194        self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
195        self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
196        self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
197        self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
198        self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)
199
200        # in python2.3 (etc), these loop endlessly in sre_parser.py
201        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
202        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
203                         'xz8')
204        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
205                         'xza')
206
207    def test_qualified_re_sub(self):
208        self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
209        self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
210        self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
211
212    def test_bug_114660(self):
213        self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
214                         'hello there')
215
216    def test_symbolic_groups(self):
217        re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
218        re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
219        re.compile(r'(?P<a1>x)\1(?(1)y)')
220        self.checkPatternError(r'(?P<a>)(?P<a>)',
221                               "redefinition of group name 'a' as group 2; "
222                               "was group 1")
223        self.checkPatternError(r'(?P<a>(?P=a))',
224                               "cannot refer to an open group", 10)
225        self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
226        self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
227        self.checkPatternError(r'(?P=', 'missing group name', 4)
228        self.checkPatternError(r'(?P=)', 'missing group name', 4)
229        self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
230        self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
231        self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
232        self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
233        self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
234        self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
235        self.checkPatternError(r'(?P<', 'missing group name', 4)
236        self.checkPatternError(r'(?P<>)', 'missing group name', 4)
237        self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
238        self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
239        self.checkPatternError(r'(?(', 'missing group name', 3)
240        self.checkPatternError(r'(?())', 'missing group name', 3)
241        self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
242        self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
243        self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
244        self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
245        # New valid/invalid identifiers in Python 3
246        re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
247        re.compile('(?P<��������������>x)(?P=��������������)(?(��������������)y)')
248        self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
249        # Support > 100 groups.
250        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
251        pat = '(?:%s)(?(200)z|t)' % pat
252        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
253
254    def test_symbolic_refs(self):
255        self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
256                                'missing >, unterminated name', 3)
257        self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
258                                'missing group name', 3)
259        self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
260        self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
261                                "bad character in group name 'a a'", 3)
262        self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
263                                'missing group name', 3)
264        self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
265                                "bad character in group name '1a1'", 3)
266        self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
267                                'invalid group reference 2', 3)
268        self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
269                                'invalid group reference 2', 1)
270        with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
271            re.sub('(?P<a>x)', r'\g<ab>', 'xx')
272        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
273        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
274        self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
275                                "bad character in group name '-1'", 3)
276        # New valid/invalid identifiers in Python 3
277        self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
278        self.assertEqual(re.sub('(?P<��������������>x)', r'\g<��������������>', 'xx'), 'xx')
279        self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
280                                "bad character in group name '©'", 3)
281        # Support > 100 groups.
282        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
283        self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
284
285    def test_re_subn(self):
286        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
287        self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
288        self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
289        self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
290        self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
291        self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
292
293    def test_re_split(self):
294        for string in ":a:b::c", S(":a:b::c"):
295            self.assertTypedEqual(re.split(":", string),
296                                  ['', 'a', 'b', '', 'c'])
297            self.assertTypedEqual(re.split(":+", string),
298                                  ['', 'a', 'b', 'c'])
299            self.assertTypedEqual(re.split("(:+)", string),
300                                  ['', ':', 'a', ':', 'b', '::', 'c'])
301        for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
302                       memoryview(b":a:b::c")):
303            self.assertTypedEqual(re.split(b":", string),
304                                  [b'', b'a', b'b', b'', b'c'])
305            self.assertTypedEqual(re.split(b":+", string),
306                                  [b'', b'a', b'b', b'c'])
307            self.assertTypedEqual(re.split(b"(:+)", string),
308                                  [b'', b':', b'a', b':', b'b', b'::', b'c'])
309        for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
310                        "\U0001d49c\U0001d49e\U0001d4b5"):
311            string = ":%s:%s::%s" % (a, b, c)
312            self.assertEqual(re.split(":", string), ['', a, b, '', c])
313            self.assertEqual(re.split(":+", string), ['', a, b, c])
314            self.assertEqual(re.split("(:+)", string),
315                             ['', ':', a, ':', b, '::', c])
316
317        self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
318        self.assertEqual(re.split("(:)+", ":a:b::c"),
319                         ['', ':', 'a', ':', 'b', ':', 'c'])
320        self.assertEqual(re.split("([b:]+)", ":a:b::c"),
321                         ['', ':', 'a', ':b::', 'c'])
322        self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
323                         ['', None, ':', 'a', None, ':', '', 'b', None, '',
324                          None, '::', 'c'])
325        self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
326                         ['', 'a', '', '', 'c'])
327
328        for sep, expected in [
329            (':*', ['', '', 'a', '', 'b', '', 'c', '']),
330            ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
331            ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']),
332            ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']),
333        ]:
334            with self.subTest(sep=sep):
335                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
336
337        for sep, expected in [
338            ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
339            (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
340            (r'(?=:)', ['', ':a', ':b', ':', ':c']),
341            (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
342        ]:
343            with self.subTest(sep=sep):
344                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
345
346    def test_qualified_re_split(self):
347        self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
348        self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
349        self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
350        self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
351                         ['', ':', 'a', ':', 'b::c'])
352        self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
353                         ['', ':', 'a', ':', 'b::c'])
354        self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
355                         ['', ':', '', '', 'a:b::c'])
356
357    def test_re_findall(self):
358        self.assertEqual(re.findall(":+", "abc"), [])
359        for string in "a:b::c:::d", S("a:b::c:::d"):
360            self.assertTypedEqual(re.findall(":+", string),
361                                  [":", "::", ":::"])
362            self.assertTypedEqual(re.findall("(:+)", string),
363                                  [":", "::", ":::"])
364            self.assertTypedEqual(re.findall("(:)(:*)", string),
365                                  [(":", ""), (":", ":"), (":", "::")])
366        for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
367                       memoryview(b"a:b::c:::d")):
368            self.assertTypedEqual(re.findall(b":+", string),
369                                  [b":", b"::", b":::"])
370            self.assertTypedEqual(re.findall(b"(:+)", string),
371                                  [b":", b"::", b":::"])
372            self.assertTypedEqual(re.findall(b"(:)(:*)", string),
373                                  [(b":", b""), (b":", b":"), (b":", b"::")])
374        for x in ("\xe0", "\u0430", "\U0001d49c"):
375            xx = x * 2
376            xxx = x * 3
377            string = "a%sb%sc%sd" % (x, xx, xxx)
378            self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
379            self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
380            self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
381                             [(x, ""), (x, x), (x, xx)])
382
383    def test_bug_117612(self):
384        self.assertEqual(re.findall(r"(a|(b))", "aba"),
385                         [("a", ""),("b", "b"),("a", "")])
386
387    def test_re_match(self):
388        for string in 'a', S('a'):
389            self.assertEqual(re.match('a', string).groups(), ())
390            self.assertEqual(re.match('(a)', string).groups(), ('a',))
391            self.assertEqual(re.match('(a)', string).group(0), 'a')
392            self.assertEqual(re.match('(a)', string).group(1), 'a')
393            self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
394        for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
395            self.assertEqual(re.match(b'a', string).groups(), ())
396            self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
397            self.assertEqual(re.match(b'(a)', string).group(0), b'a')
398            self.assertEqual(re.match(b'(a)', string).group(1), b'a')
399            self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
400        for a in ("\xe0", "\u0430", "\U0001d49c"):
401            self.assertEqual(re.match(a, a).groups(), ())
402            self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
403            self.assertEqual(re.match('(%s)' % a, a).group(0), a)
404            self.assertEqual(re.match('(%s)' % a, a).group(1), a)
405            self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))
406
407        pat = re.compile('((a)|(b))(c)?')
408        self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
409        self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
410        self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
411        self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
412        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
413
414        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
415        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
416        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
417                         (None, 'b', None))
418        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
419
420    def test_group(self):
421        class Index:
422            def __init__(self, value):
423                self.value = value
424            def __index__(self):
425                return self.value
426        # A single group
427        m = re.match('(a)(b)', 'ab')
428        self.assertEqual(m.group(), 'ab')
429        self.assertEqual(m.group(0), 'ab')
430        self.assertEqual(m.group(1), 'a')
431        self.assertEqual(m.group(Index(1)), 'a')
432        self.assertRaises(IndexError, m.group, -1)
433        self.assertRaises(IndexError, m.group, 3)
434        self.assertRaises(IndexError, m.group, 1<<1000)
435        self.assertRaises(IndexError, m.group, Index(1<<1000))
436        self.assertRaises(IndexError, m.group, 'x')
437        # Multiple groups
438        self.assertEqual(m.group(2, 1), ('b', 'a'))
439        self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
440
441    def test_match_getitem(self):
442        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
443
444        m = pat.match('a')
445        self.assertEqual(m['a1'], 'a')
446        self.assertEqual(m['b2'], None)
447        self.assertEqual(m['c3'], None)
448        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
449        self.assertEqual(m[0], 'a')
450        self.assertEqual(m[1], 'a')
451        self.assertEqual(m[2], None)
452        self.assertEqual(m[3], None)
453        with self.assertRaisesRegex(IndexError, 'no such group'):
454            m['X']
455        with self.assertRaisesRegex(IndexError, 'no such group'):
456            m[-1]
457        with self.assertRaisesRegex(IndexError, 'no such group'):
458            m[4]
459        with self.assertRaisesRegex(IndexError, 'no such group'):
460            m[0, 1]
461        with self.assertRaisesRegex(IndexError, 'no such group'):
462            m[(0,)]
463        with self.assertRaisesRegex(IndexError, 'no such group'):
464            m[(0, 1)]
465        with self.assertRaisesRegex(IndexError, 'no such group'):
466            'a1={a2}'.format_map(m)
467
468        m = pat.match('ac')
469        self.assertEqual(m['a1'], 'a')
470        self.assertEqual(m['b2'], None)
471        self.assertEqual(m['c3'], 'c')
472        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
473        self.assertEqual(m[0], 'ac')
474        self.assertEqual(m[1], 'a')
475        self.assertEqual(m[2], None)
476        self.assertEqual(m[3], 'c')
477
478        # Cannot assign.
479        with self.assertRaises(TypeError):
480            m[0] = 1
481
482        # No len().
483        self.assertRaises(TypeError, len, m)
484
485    def test_re_fullmatch(self):
486        # Issue 16203: Proposal: add re.fullmatch() method.
487        self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
488        for string in "ab", S("ab"):
489            self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
490        for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
491            self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
492        for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
493            r = r"%s|%s" % (a, a + b)
494            self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
495        self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
496        self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
497        self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
498        self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
499        self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
500        self.assertIsNone(re.fullmatch(r"a+", "ab"))
501        self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
502        self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
503        self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
504        self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
505        self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
506        self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
507
508        self.assertEqual(
509            re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
510        self.assertEqual(
511            re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
512        self.assertEqual(
513            re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
514
515    def test_re_groupref_exists(self):
516        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
517                         ('(', 'a'))
518        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
519                         (None, 'a'))
520        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
521        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
522        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
523                         ('a', 'b'))
524        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
525                         (None, 'd'))
526        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
527                         (None, 'd'))
528        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
529                         ('a', ''))
530
531        # Tests for bug #1177831: exercise groups other than the first group
532        p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
533        self.assertEqual(p.match('abc').groups(),
534                         ('a', 'b', 'c'))
535        self.assertEqual(p.match('ad').groups(),
536                         ('a', None, 'd'))
537        self.assertIsNone(p.match('abd'))
538        self.assertIsNone(p.match('ac'))
539
540        # Support > 100 groups.
541        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
542        pat = '(?:%s)(?(200)z)' % pat
543        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
544
545        self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
546        self.checkPatternError(r'()(?(1)a|b',
547                               'missing ), unterminated subpattern', 2)
548        self.checkPatternError(r'()(?(1)a|b|c)',
549                               'conditional backref with more than '
550                               'two branches', 10)
551
552    def test_re_groupref_overflow(self):
553        from sre_constants import MAXGROUPS
554        self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx',
555                                'invalid group reference %d' % MAXGROUPS, 3)
556        self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS,
557                               'invalid group reference %d' % MAXGROUPS, 10)
558
559    def test_re_groupref(self):
560        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
561                         ('|', 'a'))
562        self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
563                         (None, 'a'))
564        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
565        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
566        self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
567                         ('a', 'a'))
568        self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
569                         (None, None))
570
571        self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
572
573    def test_groupdict(self):
574        self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
575                                  'first second').groupdict(),
576                         {'first':'first', 'second':'second'})
577
578    def test_expand(self):
579        self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
580                                  "first second")
581                                  .expand(r"\2 \1 \g<second> \g<first>"),
582                         "second first second first")
583        self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
584                                  "first")
585                                  .expand(r"\2 \g<second>"),
586                         " ")
587
588    def test_repeat_minmax(self):
589        self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
590        self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
591        self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
592        self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))
593
594        self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
595        self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
596        self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
597        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
598        self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
599        self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
600        self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
601        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
602
603        self.assertIsNone(re.match(r"^x{1}$", "xxx"))
604        self.assertIsNone(re.match(r"^x{1}?$", "xxx"))
605        self.assertIsNone(re.match(r"^x{1,2}$", "xxx"))
606        self.assertIsNone(re.match(r"^x{1,2}?$", "xxx"))
607
608        self.assertTrue(re.match(r"^x{3}$", "xxx"))
609        self.assertTrue(re.match(r"^x{1,3}$", "xxx"))
610        self.assertTrue(re.match(r"^x{3,3}$", "xxx"))
611        self.assertTrue(re.match(r"^x{1,4}$", "xxx"))
612        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
613        self.assertTrue(re.match(r"^x{3}?$", "xxx"))
614        self.assertTrue(re.match(r"^x{1,3}?$", "xxx"))
615        self.assertTrue(re.match(r"^x{1,4}?$", "xxx"))
616        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
617
618        self.assertIsNone(re.match(r"^x{}$", "xxx"))
619        self.assertTrue(re.match(r"^x{}$", "x{}"))
620
621        self.checkPatternError(r'x{2,1}',
622                               'min repeat greater than max repeat', 2)
623
624    def test_getattr(self):
625        self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
626        self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
627        self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
628        self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
629        self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
630                         {'first': 1, 'other': 2})
631
632        self.assertEqual(re.match("(a)", "a").pos, 0)
633        self.assertEqual(re.match("(a)", "a").endpos, 1)
634        self.assertEqual(re.match("(a)", "a").string, "a")
635        self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
636        self.assertTrue(re.match("(a)", "a").re)
637
638        # Issue 14260. groupindex should be non-modifiable mapping.
639        p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
640        self.assertEqual(sorted(p.groupindex), ['first', 'other'])
641        self.assertEqual(p.groupindex['other'], 2)
642        with self.assertRaises(TypeError):
643            p.groupindex['other'] = 0
644        self.assertEqual(p.groupindex['other'], 2)
645
646    def test_special_escapes(self):
647        self.assertEqual(re.search(r"\b(b.)\b",
648                                   "abcd abc bcd bx").group(1), "bx")
649        self.assertEqual(re.search(r"\B(b.)\B",
650                                   "abc bcd bc abxd").group(1), "bx")
651        self.assertEqual(re.search(r"\b(b.)\b",
652                                   "abcd abc bcd bx", re.ASCII).group(1), "bx")
653        self.assertEqual(re.search(r"\B(b.)\B",
654                                   "abc bcd bc abxd", re.ASCII).group(1), "bx")
655        self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
656        self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
657        self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
658        self.assertEqual(re.search(br"\b(b.)\b",
659                                   b"abcd abc bcd bx").group(1), b"bx")
660        self.assertEqual(re.search(br"\B(b.)\B",
661                                   b"abc bcd bc abxd").group(1), b"bx")
662        self.assertEqual(re.search(br"\b(b.)\b",
663                                   b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
664        self.assertEqual(re.search(br"\B(b.)\B",
665                                   b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
666        self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
667        self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
668        self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
669        self.assertEqual(re.search(r"\d\D\w\W\s\S",
670                                   "1aa! a").group(0), "1aa! a")
671        self.assertEqual(re.search(br"\d\D\w\W\s\S",
672                                   b"1aa! a").group(0), b"1aa! a")
673        self.assertEqual(re.search(r"\d\D\w\W\s\S",
674                                   "1aa! a", re.ASCII).group(0), "1aa! a")
675        self.assertEqual(re.search(br"\d\D\w\W\s\S",
676                                   b"1aa! a", re.LOCALE).group(0), b"1aa! a")
677
678    def test_other_escapes(self):
679        self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
680        self.assertEqual(re.match(r"\(", '(').group(), '(')
681        self.assertIsNone(re.match(r"\(", ')'))
682        self.assertEqual(re.match(r"\\", '\\').group(), '\\')
683        self.assertEqual(re.match(r"[\]]", ']').group(), ']')
684        self.assertIsNone(re.match(r"[\]]", '['))
685        self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
686        self.assertIsNone(re.match(r"[a\-c]", 'b'))
687        self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
688        self.assertIsNone(re.match(r"[\^a]+", 'b'))
689        re.purge()  # for warnings
690        for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
691            with self.subTest(c):
692                self.assertRaises(re.error, re.compile, '\\%c' % c)
693        for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
694            with self.subTest(c):
695                self.assertRaises(re.error, re.compile, '[\\%c]' % c)
696
697    def test_named_unicode_escapes(self):
698        # test individual Unicode named escapes
699        self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))
700        self.assertTrue(re.match(r'\N{less-than sign}', '<'))
701        self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>'))
702        self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d'))
703        self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH '
704                                 r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}',
705                                 '\ufbf9'))
706        self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
707                                 '='))
708        self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
709                                   ';'))
710
711        # test errors in \N{name} handling - only valid names should pass
712        self.checkPatternError(r'\N', 'missing {', 2)
713        self.checkPatternError(r'[\N]', 'missing {', 3)
714        self.checkPatternError(r'\N{', 'missing character name', 3)
715        self.checkPatternError(r'[\N{', 'missing character name', 4)
716        self.checkPatternError(r'\N{}', 'missing character name', 3)
717        self.checkPatternError(r'[\N{}]', 'missing character name', 4)
718        self.checkPatternError(r'\NSNAKE}', 'missing {', 2)
719        self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3)
720        self.checkPatternError(r'\N{SNAKE',
721                               'missing }, unterminated name', 3)
722        self.checkPatternError(r'[\N{SNAKE]',
723                               'missing }, unterminated name', 4)
724        self.checkPatternError(r'[\N{SNAKE]}',
725                               "undefined character name 'SNAKE]'", 1)
726        self.checkPatternError(r'\N{SPAM}',
727                               "undefined character name 'SPAM'", 0)
728        self.checkPatternError(r'[\N{SPAM}]',
729                               "undefined character name 'SPAM'", 1)
730        self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0)
731        self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1)
732
733    def test_string_boundaries(self):
734        # See http://bugs.python.org/issue10713
735        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
736                         "abc")
737        # There's a word boundary at the start of a string.
738        self.assertTrue(re.match(r"\b", "abc"))
739        # A non-empty string includes a non-boundary zero-length match.
740        self.assertTrue(re.search(r"\B", "abc"))
741        # There is no non-boundary match at the start of a string.
742        self.assertFalse(re.match(r"\B", "abc"))
743        # However, an empty string contains no word boundaries, and also no
744        # non-boundaries.
745        self.assertIsNone(re.search(r"\B", ""))
746        # This one is questionable and different from the perlre behaviour,
747        # but describes current behavior.
748        self.assertIsNone(re.search(r"\b", ""))
749        # A single word-character string has two boundaries, but no
750        # non-boundary gaps.
751        self.assertEqual(len(re.findall(r"\b", "a")), 2)
752        self.assertEqual(len(re.findall(r"\B", "a")), 0)
753        # If there are no words, there are no boundaries
754        self.assertEqual(len(re.findall(r"\b", " ")), 0)
755        self.assertEqual(len(re.findall(r"\b", "   ")), 0)
756        # Can match around the whitespace.
757        self.assertEqual(len(re.findall(r"\B", " ")), 2)
758
759    def test_bigcharset(self):
760        self.assertEqual(re.match("([\u2222\u2223])",
761                                  "\u2222").group(1), "\u2222")
762        r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
763        self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
764
765    def test_big_codesize(self):
766        # Issue #1160
767        r = re.compile('|'.join(('%d'%x for x in range(10000))))
768        self.assertTrue(r.match('1000'))
769        self.assertTrue(r.match('9999'))
770
771    def test_anyall(self):
772        self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
773                         "a\nb")
774        self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
775                         "a\n\nb")
776
777    def test_lookahead(self):
778        self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
779        self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
780        self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
781        self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
782        self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
783        self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
784        self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
785
786        self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
787        self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
788        self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
789        self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
790
791        # Group reference.
792        self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
793        self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
794        # Conditional group reference.
795        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
796        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
797        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
798        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
799        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
800        # Group used before defined.
801        self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
802        self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
803        self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
804
805    def test_lookbehind(self):
806        self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
807        self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
808        self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
809        self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
810        # Group reference.
811        self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
812        self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
813        self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
814        self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
815        # Conditional group reference.
816        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
817        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
818        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
819        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
820        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
821        # Group used before defined.
822        self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
823        self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
824        self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
825        # Group defined in the same lookbehind pattern
826        self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
827        self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
828        self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
829        self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
830
831    def test_ignore_case(self):
832        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
833        self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
834        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
835        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
836        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
837        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
838        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
839        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
840        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
841        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
842
843        assert '\u212a'.lower() == 'k' # 'K'
844        self.assertTrue(re.match(r'K', '\u212a', re.I))
845        self.assertTrue(re.match(r'k', '\u212a', re.I))
846        self.assertTrue(re.match(r'\u212a', 'K', re.I))
847        self.assertTrue(re.match(r'\u212a', 'k', re.I))
848        assert '\u017f'.upper() == 'S' # 'ſ'
849        self.assertTrue(re.match(r'S', '\u017f', re.I))
850        self.assertTrue(re.match(r's', '\u017f', re.I))
851        self.assertTrue(re.match(r'\u017f', 'S', re.I))
852        self.assertTrue(re.match(r'\u017f', 's', re.I))
853        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
854        self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
855        self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
856
857    def test_ignore_case_set(self):
858        self.assertTrue(re.match(r'[19A]', 'A', re.I))
859        self.assertTrue(re.match(r'[19a]', 'a', re.I))
860        self.assertTrue(re.match(r'[19a]', 'A', re.I))
861        self.assertTrue(re.match(r'[19A]', 'a', re.I))
862        self.assertTrue(re.match(br'[19A]', b'A', re.I))
863        self.assertTrue(re.match(br'[19a]', b'a', re.I))
864        self.assertTrue(re.match(br'[19a]', b'A', re.I))
865        self.assertTrue(re.match(br'[19A]', b'a', re.I))
866        assert '\u212a'.lower() == 'k' # 'K'
867        self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
868        self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
869        self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
870        self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
871        assert '\u017f'.upper() == 'S' # 'ſ'
872        self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
873        self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
874        self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
875        self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
876        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
877        self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
878        self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
879
880    def test_ignore_case_range(self):
881        # Issues #3511, #17381.
882        self.assertTrue(re.match(r'[9-a]', '_', re.I))
883        self.assertIsNone(re.match(r'[9-A]', '_', re.I))
884        self.assertTrue(re.match(br'[9-a]', b'_', re.I))
885        self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
886        self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
887        self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
888        self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
889        self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
890        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
891        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
892        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
893        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
894        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
895        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
896        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
897        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
898
899        assert '\u212a'.lower() == 'k' # 'K'
900        self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
901        self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
902        self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
903        self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
904        assert '\u017f'.upper() == 'S' # 'ſ'
905        self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
906        self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
907        self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
908        self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
909        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
910        self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
911        self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
912
913    def test_category(self):
914        self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
915
916    @cpython_only
917    def test_case_helpers(self):
918        import _sre
919        for i in range(128):
920            c = chr(i)
921            lo = ord(c.lower())
922            self.assertEqual(_sre.ascii_tolower(i), lo)
923            self.assertEqual(_sre.unicode_tolower(i), lo)
924            iscased = c in string.ascii_letters
925            self.assertEqual(_sre.ascii_iscased(i), iscased)
926            self.assertEqual(_sre.unicode_iscased(i), iscased)
927
928        for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
929            c = chr(i)
930            self.assertEqual(_sre.ascii_tolower(i), i)
931            if i != 0x0130:
932                self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
933            iscased = c != c.lower() or c != c.upper()
934            self.assertFalse(_sre.ascii_iscased(i))
935            self.assertEqual(_sre.unicode_iscased(i),
936                             c != c.lower() or c != c.upper())
937
938        self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
939        self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
940        self.assertFalse(_sre.ascii_iscased(0x0130))
941        self.assertTrue(_sre.unicode_iscased(0x0130))
942
943    def test_not_literal(self):
944        self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
945        self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
946
947    def test_possible_set_operations(self):
948        s = bytes(range(128)).decode()
949        with self.assertWarns(FutureWarning):
950            p = re.compile(r'[0-9--1]')
951        self.assertEqual(p.findall(s), list('-./0123456789'))
952        self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
953        with self.assertWarns(FutureWarning):
954            p = re.compile(r'[%--1]')
955        self.assertEqual(p.findall(s), list("%&'()*+,-1"))
956        with self.assertWarns(FutureWarning):
957            p = re.compile(r'[%--]')
958        self.assertEqual(p.findall(s), list("%&'()*+,-"))
959
960        with self.assertWarns(FutureWarning):
961            p = re.compile(r'[0-9&&1]')
962        self.assertEqual(p.findall(s), list('&0123456789'))
963        with self.assertWarns(FutureWarning):
964            p = re.compile(r'[\d&&1]')
965        self.assertEqual(p.findall(s), list('&0123456789'))
966        self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
967
968        with self.assertWarns(FutureWarning):
969            p = re.compile(r'[0-9||a]')
970        self.assertEqual(p.findall(s), list('0123456789a|'))
971        with self.assertWarns(FutureWarning):
972            p = re.compile(r'[\d||a]')
973        self.assertEqual(p.findall(s), list('0123456789a|'))
974        self.assertEqual(re.findall(r'[||1]', s), list('1|'))
975
976        with self.assertWarns(FutureWarning):
977            p = re.compile(r'[0-9~~1]')
978        self.assertEqual(p.findall(s), list('0123456789~'))
979        with self.assertWarns(FutureWarning):
980            p = re.compile(r'[\d~~1]')
981        self.assertEqual(p.findall(s), list('0123456789~'))
982        self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
983
984        with self.assertWarns(FutureWarning):
985            p = re.compile(r'[[0-9]|]')
986        self.assertEqual(p.findall(s), list('0123456789[]'))
987
988        with self.assertWarns(FutureWarning):
989            p = re.compile(r'[[:digit:]|]')
990        self.assertEqual(p.findall(s), list(':[]dgit'))
991
992    def test_search_coverage(self):
993        self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
994        self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
995
996    def assertMatch(self, pattern, text, match=None, span=None,
997                    matcher=re.fullmatch):
998        if match is None and span is None:
999            # the pattern matches the whole text
1000            match = text
1001            span = (0, len(text))
1002        elif match is None or span is None:
1003            raise ValueError('If match is not None, span should be specified '
1004                             '(and vice versa).')
1005        m = matcher(pattern, text)
1006        self.assertTrue(m)
1007        self.assertEqual(m.group(), match)
1008        self.assertEqual(m.span(), span)
1009
1010    LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`'
1011
1012    def test_re_escape(self):
1013        p = ''.join(chr(i) for i in range(256))
1014        for c in p:
1015            self.assertMatch(re.escape(c), c)
1016            self.assertMatch('[' + re.escape(c) + ']', c)
1017            self.assertMatch('(?x)' + re.escape(c), c)
1018        self.assertMatch(re.escape(p), p)
1019        for c in '-.]{}':
1020            self.assertEqual(re.escape(c)[:1], '\\')
1021        literal_chars = self.LITERAL_CHARS
1022        self.assertEqual(re.escape(literal_chars), literal_chars)
1023
1024    def test_re_escape_bytes(self):
1025        p = bytes(range(256))
1026        for i in p:
1027            b = bytes([i])
1028            self.assertMatch(re.escape(b), b)
1029            self.assertMatch(b'[' + re.escape(b) + b']', b)
1030            self.assertMatch(b'(?x)' + re.escape(b), b)
1031        self.assertMatch(re.escape(p), p)
1032        for i in b'-.]{}':
1033            b = bytes([i])
1034            self.assertEqual(re.escape(b)[:1], b'\\')
1035        literal_chars = self.LITERAL_CHARS.encode('ascii')
1036        self.assertEqual(re.escape(literal_chars), literal_chars)
1037
1038    def test_re_escape_non_ascii(self):
1039        s = 'xxx\u2620\u2620\u2620xxx'
1040        s_escaped = re.escape(s)
1041        self.assertEqual(s_escaped, s)
1042        self.assertMatch(s_escaped, s)
1043        self.assertMatch('.%s+.' % re.escape('\u2620'), s,
1044                         'x\u2620\u2620\u2620x', (2, 7), re.search)
1045
1046    def test_re_escape_non_ascii_bytes(self):
1047        b = 'y\u2620y\u2620y'.encode('utf-8')
1048        b_escaped = re.escape(b)
1049        self.assertEqual(b_escaped, b)
1050        self.assertMatch(b_escaped, b)
1051        res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
1052        self.assertEqual(len(res), 2)
1053
1054    def test_pickling(self):
1055        import pickle
1056        oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
1057        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
1058            pickled = pickle.dumps(oldpat, proto)
1059            newpat = pickle.loads(pickled)
1060            self.assertEqual(newpat, oldpat)
1061        # current pickle expects the _compile() reconstructor in re module
1062        from re import _compile
1063
1064    def test_copying(self):
1065        import copy
1066        p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?')
1067        self.assertIs(copy.copy(p), p)
1068        self.assertIs(copy.deepcopy(p), p)
1069        m = p.match('12.34')
1070        self.assertIs(copy.copy(m), m)
1071        self.assertIs(copy.deepcopy(m), m)
1072
1073    def test_constants(self):
1074        self.assertEqual(re.I, re.IGNORECASE)
1075        self.assertEqual(re.L, re.LOCALE)
1076        self.assertEqual(re.M, re.MULTILINE)
1077        self.assertEqual(re.S, re.DOTALL)
1078        self.assertEqual(re.X, re.VERBOSE)
1079
1080    def test_flags(self):
1081        for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
1082            self.assertTrue(re.compile('^pattern$', flag))
1083        for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
1084            self.assertTrue(re.compile(b'^pattern$', flag))
1085
1086    def test_sre_character_literals(self):
1087        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
1088            if i < 256:
1089                self.assertTrue(re.match(r"\%03o" % i, chr(i)))
1090                self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
1091                self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
1092                self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
1093                self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
1094                self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
1095            if i < 0x10000:
1096                self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
1097                self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
1098                self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
1099            self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
1100            self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
1101            self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
1102        self.assertTrue(re.match(r"\0", "\000"))
1103        self.assertTrue(re.match(r"\08", "\0008"))
1104        self.assertTrue(re.match(r"\01", "\001"))
1105        self.assertTrue(re.match(r"\018", "\0018"))
1106        self.checkPatternError(r"\567",
1107                               r'octal escape value \567 outside of '
1108                               r'range 0-0o377', 0)
1109        self.checkPatternError(r"\911", 'invalid group reference 91', 1)
1110        self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
1111        self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
1112        self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
1113        self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
1114        self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
1115        self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
1116        self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
1117
1118    def test_sre_character_class_literals(self):
1119        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
1120            if i < 256:
1121                self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
1122                self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
1123                self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
1124                self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
1125                self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
1126                self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
1127                self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
1128                self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
1129            if i < 0x10000:
1130                self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
1131                self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
1132                self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
1133            self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
1134            self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
1135            self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
1136        self.checkPatternError(r"[\567]",
1137                               r'octal escape value \567 outside of '
1138                               r'range 0-0o377', 1)
1139        self.checkPatternError(r"[\911]", r'bad escape \9', 1)
1140        self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
1141        self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
1142        self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
1143        self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
1144        self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
1145
1146    def test_sre_byte_literals(self):
1147        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
1148            self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
1149            self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
1150            self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
1151            self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
1152            self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
1153            self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
1154        self.assertRaises(re.error, re.compile, br"\u1234")
1155        self.assertRaises(re.error, re.compile, br"\U00012345")
1156        self.assertTrue(re.match(br"\0", b"\000"))
1157        self.assertTrue(re.match(br"\08", b"\0008"))
1158        self.assertTrue(re.match(br"\01", b"\001"))
1159        self.assertTrue(re.match(br"\018", b"\0018"))
1160        self.checkPatternError(br"\567",
1161                               r'octal escape value \567 outside of '
1162                               r'range 0-0o377', 0)
1163        self.checkPatternError(br"\911", 'invalid group reference 91', 1)
1164        self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
1165        self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
1166
1167    def test_sre_byte_class_literals(self):
1168        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
1169            self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
1170            self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
1171            self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
1172            self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
1173            self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
1174            self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
1175            self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
1176            self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
1177        self.assertRaises(re.error, re.compile, br"[\u1234]")
1178        self.assertRaises(re.error, re.compile, br"[\U00012345]")
1179        self.checkPatternError(br"[\567]",
1180                               r'octal escape value \567 outside of '
1181                               r'range 0-0o377', 1)
1182        self.checkPatternError(br"[\911]", r'bad escape \9', 1)
1183        self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1184
def test_character_set_errors(self):
    # An unterminated set is reported at the offset of the opening '['.
    for pattern in (r'[', r'[^', r'[a'):
        self.checkPatternError(pattern, 'unterminated character set', 0)
    # bug 545855 -- This pattern failed to cause a compile error as it
    # should, instead provoking a TypeError.
    self.checkPatternError(r"[a-", 'unterminated character set', 0)

    # Invalid ranges are reported at the offset of the range start.
    self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
    self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
    self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
1195
1196    def test_bug_113254(self):
1197        self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1198        self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1199        self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1200
1201    def test_bug_527371(self):
1202        # bug described in patches 527371/672491
1203        self.assertIsNone(re.match(r'(a)?a','a').lastindex)
1204        self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1205        self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1206        self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
1207        self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
1208
1209    def test_bug_418626(self):
1210        # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1211        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1212        # pattern '*?' on a long string.
1213        self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1214        self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1215                         20003)
1216        self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
1217        # non-simple '*?' still used to hit the recursion limit, before the
1218        # non-recursive scheme was implemented.
1219        self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
1220
1221    def test_bug_612074(self):
1222        pat="["+re.escape("\u2039")+"]"
1223        self.assertEqual(re.compile(pat) and 1, 1)
1224
1225    def test_stack_overflow(self):
1226        # nasty cases that used to overflow the straightforward recursive
1227        # implementation of repeated groups.
1228        self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1229        self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1230        self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
1231
def test_nothing_to_repeat(self):
    # A quantifier with nothing before it is an error, both at the very
    # start of the pattern and at the start of a non-capturing group.
    for reps in '*', '+', '?', '{1,2}':
        for mod in '', '?':
            quantifier = reps + mod
            self.checkPatternError('%s' % quantifier,
                                   'nothing to repeat', 0)
            self.checkPatternError('(?:%s)' % quantifier,
                                   'nothing to repeat', 3)
1239
def test_multiple_repeat(self):
    # A quantifier applied directly to another quantifier is an error,
    # reported at the offset where the outer quantifier starts.
    outer_ops = [reps + mod
                 for reps in ('*', '+', '{1,2}')
                 for mod in ('', '?')]
    inner_ops = [reps + mod
                 for reps in ('*', '+', '?', '{1,2}')
                 for mod in ('', '?')]
    for outer_op in outer_ops:
        for inner_op in inner_ops:
            self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
                                   'multiple repeat', 1 + len(inner_op))
1249
1250    def test_unlimited_zero_width_repeat(self):
1251        # Issue #9669
1252        self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1253        self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1254        self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1255        self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1256        self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1257        self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1258
1259    def test_scanner(self):
1260        def s_ident(scanner, token): return token
1261        def s_operator(scanner, token): return "op%s" % token
1262        def s_float(scanner, token): return float(token)
1263        def s_int(scanner, token): return int(token)
1264
1265        scanner = Scanner([
1266            (r"[a-zA-Z_]\w*", s_ident),
1267            (r"\d+\.\d*", s_float),
1268            (r"\d+", s_int),
1269            (r"=|\+|-|\*|/", s_operator),
1270            (r"\s+", None),
1271            ])
1272
1273        self.assertTrue(scanner.scanner.scanner("").pattern)
1274
1275        self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1276                         (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1277                           'op+', 'bar'], ''))
1278
1279    def test_bug_448951(self):
1280        # bug 448951 (similar to 429357, but with single char match)
1281        # (Also test greedy matches.)
1282        for op in '','?','*':
1283            self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1284                             (None, None))
1285            self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1286                             ('a:', 'a'))
1287
1288    def test_bug_725106(self):
1289        # capturing groups in alternatives in repeats
1290        self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1291                         ('b', 'a'))
1292        self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1293                         ('c', 'b'))
1294        self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1295                         ('b', None))
1296        self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1297                         ('b', None))
1298        self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1299                         ('b', 'a'))
1300        self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1301                         ('c', 'b'))
1302        self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1303                         ('b', None))
1304        self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1305                         ('b', None))
1306
1307    def test_bug_725149(self):
1308        # mark_stack_base restoring before restoring marks
1309        self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1310                         ('a', None))
1311        self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1312                         ('a', None, None))
1313
1314    def test_bug_764548(self):
1315        # bug 764548, re.compile() barfs on str/unicode subclasses
1316        class my_unicode(str): pass
1317        pat = re.compile(my_unicode("abc"))
1318        self.assertIsNone(pat.match("xyz"))
1319
1320    def test_finditer(self):
1321        iter = re.finditer(r":+", "a:b::c:::d")
1322        self.assertEqual([item.group(0) for item in iter],
1323                         [":", "::", ":::"])
1324
1325        pat = re.compile(r":+")
1326        iter = pat.finditer("a:b::c:::d", 1, 10)
1327        self.assertEqual([item.group(0) for item in iter],
1328                         [":", "::", ":::"])
1329
1330        pat = re.compile(r":+")
1331        iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1332        self.assertEqual([item.group(0) for item in iter],
1333                         [":", "::", ":::"])
1334
1335        pat = re.compile(r":+")
1336        iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1337        self.assertEqual([item.group(0) for item in iter],
1338                         [":", "::", ":::"])
1339
1340        pat = re.compile(r":+")
1341        iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1342        self.assertEqual([item.group(0) for item in iter],
1343                         ["::", "::"])
1344
1345    def test_bug_926075(self):
1346        self.assertIsNot(re.compile('bug_926075'),
1347                         re.compile(b'bug_926075'))
1348
1349    def test_bug_931848(self):
1350        pattern = "[\u002E\u3002\uFF0E\uFF61]"
1351        self.assertEqual(re.compile(pattern).split("a.b.c"),
1352                         ['a','b','c'])
1353
1354    def test_bug_581080(self):
1355        iter = re.finditer(r"\s", "a b")
1356        self.assertEqual(next(iter).span(), (1,2))
1357        self.assertRaises(StopIteration, next, iter)
1358
1359        scanner = re.compile(r"\s").scanner("a b")
1360        self.assertEqual(scanner.search().span(), (1, 2))
1361        self.assertIsNone(scanner.search())
1362
1363    def test_bug_817234(self):
1364        iter = re.finditer(r".*", "asdf")
1365        self.assertEqual(next(iter).span(), (0, 4))
1366        self.assertEqual(next(iter).span(), (4, 4))
1367        self.assertRaises(StopIteration, next, iter)
1368
1369    def test_bug_6561(self):
1370        # '\d' should match characters in Unicode category 'Nd'
1371        # (Number, Decimal Digit), but not those in 'Nl' (Number,
1372        # Letter) or 'No' (Number, Other).
1373        decimal_digits = [
1374            '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1375            '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1376            '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1377            ]
1378        for x in decimal_digits:
1379            self.assertEqual(re.match(r'^\d$', x).group(0), x)
1380
1381        not_decimal_digits = [
1382            '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1383            '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1384            '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1385            '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1386            ]
1387        for x in not_decimal_digits:
1388            self.assertIsNone(re.match(r'^\d$', x))
1389
1390    def test_empty_array(self):
1391        # SF buf 1647541
1392        import array
1393        for typecode in 'bBuhHiIlLfd':
1394            a = array.array(typecode)
1395            self.assertIsNone(re.compile(b"bla").match(a))
1396            self.assertEqual(re.compile(b"").match(a).groups(), ())
1397
1398    def test_inline_flags(self):
1399        # Bug #1700
1400        upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1401        lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
1402
1403        p = re.compile('.' + upper_char, re.I | re.S)
1404        q = p.match('\n' + lower_char)
1405        self.assertTrue(q)
1406
1407        p = re.compile('.' + lower_char, re.I | re.S)
1408        q = p.match('\n' + upper_char)
1409        self.assertTrue(q)
1410
1411        p = re.compile('(?i).' + upper_char, re.S)
1412        q = p.match('\n' + lower_char)
1413        self.assertTrue(q)
1414
1415        p = re.compile('(?i).' + lower_char, re.S)
1416        q = p.match('\n' + upper_char)
1417        self.assertTrue(q)
1418
1419        p = re.compile('(?is).' + upper_char)
1420        q = p.match('\n' + lower_char)
1421        self.assertTrue(q)
1422
1423        p = re.compile('(?is).' + lower_char)
1424        q = p.match('\n' + upper_char)
1425        self.assertTrue(q)
1426
1427        p = re.compile('(?s)(?i).' + upper_char)
1428        q = p.match('\n' + lower_char)
1429        self.assertTrue(q)
1430
1431        p = re.compile('(?s)(?i).' + lower_char)
1432        q = p.match('\n' + upper_char)
1433        self.assertTrue(q)
1434
1435        self.assertTrue(re.match('(?ix) ' + upper_char, lower_char))
1436        self.assertTrue(re.match('(?ix) ' + lower_char, upper_char))
1437        self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X))
1438        self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char))
1439        self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X))
1440
1441        p = upper_char + '(?i)'
1442        with self.assertWarns(DeprecationWarning) as warns:
1443            self.assertTrue(re.match(p, lower_char))
1444        self.assertEqual(
1445            str(warns.warnings[0].message),
1446            'Flags not at the start of the expression %r' % p
1447        )
1448        self.assertEqual(warns.warnings[0].filename, __file__)
1449
1450        p = upper_char + '(?i)%s' % ('.?' * 100)
1451        with self.assertWarns(DeprecationWarning) as warns:
1452            self.assertTrue(re.match(p, lower_char))
1453        self.assertEqual(
1454            str(warns.warnings[0].message),
1455            'Flags not at the start of the expression %r (truncated)' % p[:20]
1456        )
1457        self.assertEqual(warns.warnings[0].filename, __file__)
1458
1459        # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning
1460        with warnings.catch_warnings():
1461            warnings.simplefilter('error', BytesWarning)
1462            p = b'A(?i)'
1463            with self.assertWarns(DeprecationWarning) as warns:
1464                self.assertTrue(re.match(p, b'a'))
1465            self.assertEqual(
1466                str(warns.warnings[0].message),
1467                'Flags not at the start of the expression %r' % p
1468            )
1469            self.assertEqual(warns.warnings[0].filename, __file__)
1470
1471        with self.assertWarns(DeprecationWarning):
1472            self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char))
1473        with self.assertWarns(DeprecationWarning):
1474            self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char))
1475        with self.assertWarns(DeprecationWarning):
1476            self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char))
1477        with self.assertWarns(DeprecationWarning):
1478            self.assertTrue(re.match('^(?i)' + upper_char, lower_char))
1479        with self.assertWarns(DeprecationWarning):
1480            self.assertTrue(re.match('$|(?i)' + upper_char, lower_char))
1481        with self.assertWarns(DeprecationWarning) as warns:
1482            self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char))
1483        self.assertRegex(str(warns.warnings[0].message),
1484                         'Flags not at the start')
1485        self.assertEqual(warns.warnings[0].filename, __file__)
1486        with self.assertWarns(DeprecationWarning) as warns:
1487            self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')',
1488                                         lower_char))
1489        self.assertRegex(str(warns.warnings[0].message),
1490                         'Flags not at the start')
1491        self.assertEqual(warns.warnings[0].filename, __file__)
1492        with self.assertWarns(DeprecationWarning) as warns:
1493            self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')',
1494                                         lower_char))
1495        self.assertRegex(str(warns.warnings[0].message),
1496                         'Flags not at the start')
1497        self.assertEqual(warns.warnings[0].filename, __file__)
1498
1499
1500    def test_dollar_matches_twice(self):
1501        "$ matches the end of string, and just before the terminating \n"
1502        pattern = re.compile('$')
1503        self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1504        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1505        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1506
1507        pattern = re.compile('$', re.MULTILINE)
1508        self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1509        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1510        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1511
1512    def test_bytes_str_mixing(self):
1513        # Mixing str and bytes is disallowed
1514        pat = re.compile('.')
1515        bpat = re.compile(b'.')
1516        self.assertRaises(TypeError, pat.match, b'b')
1517        self.assertRaises(TypeError, bpat.match, 'b')
1518        self.assertRaises(TypeError, pat.sub, b'b', 'c')
1519        self.assertRaises(TypeError, pat.sub, 'b', b'c')
1520        self.assertRaises(TypeError, pat.sub, b'b', b'c')
1521        self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1522        self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1523        self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1524
1525    def test_ascii_and_unicode_flag(self):
1526        # String patterns
1527        for flags in (0, re.UNICODE):
1528            pat = re.compile('\xc0', flags | re.IGNORECASE)
1529            self.assertTrue(pat.match('\xe0'))
1530            pat = re.compile(r'\w', flags)
1531            self.assertTrue(pat.match('\xe0'))
1532        pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
1533        self.assertIsNone(pat.match('\xe0'))
1534        pat = re.compile('(?a)\xc0', re.IGNORECASE)
1535        self.assertIsNone(pat.match('\xe0'))
1536        pat = re.compile(r'\w', re.ASCII)
1537        self.assertIsNone(pat.match('\xe0'))
1538        pat = re.compile(r'(?a)\w')
1539        self.assertIsNone(pat.match('\xe0'))
1540        # Bytes patterns
1541        for flags in (0, re.ASCII):
1542            pat = re.compile(b'\xc0', flags | re.IGNORECASE)
1543            self.assertIsNone(pat.match(b'\xe0'))
1544            pat = re.compile(br'\w', flags)
1545            self.assertIsNone(pat.match(b'\xe0'))
1546        # Incompatibilities
1547        self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
1548        self.assertRaises(re.error, re.compile, br'(?u)\w')
1549        self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
1550        self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
1551        self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
1552        self.assertRaises(re.error, re.compile, r'(?au)\w')
1553
1554    def test_locale_flag(self):
1555        enc = locale.getpreferredencoding()
1556        # Search non-ASCII letter
1557        for i in range(128, 256):
1558            try:
1559                c = bytes([i]).decode(enc)
1560                sletter = c.lower()
1561                if sletter == c: continue
1562                bletter = sletter.encode(enc)
1563                if len(bletter) != 1: continue
1564                if bletter.decode(enc) != sletter: continue
1565                bpat = re.escape(bytes([i]))
1566                break
1567            except (UnicodeError, TypeError):
1568                pass
1569        else:
1570            bletter = None
1571            bpat = b'A'
1572        # Bytes patterns
1573        pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1574        if bletter:
1575            self.assertTrue(pat.match(bletter))
1576        pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1577        if bletter:
1578            self.assertTrue(pat.match(bletter))
1579        pat = re.compile(bpat, re.IGNORECASE)
1580        if bletter:
1581            self.assertIsNone(pat.match(bletter))
1582        pat = re.compile(br'\w', re.LOCALE)
1583        if bletter:
1584            self.assertTrue(pat.match(bletter))
1585        pat = re.compile(br'(?L)\w')
1586        if bletter:
1587            self.assertTrue(pat.match(bletter))
1588        pat = re.compile(br'\w')
1589        if bletter:
1590            self.assertIsNone(pat.match(bletter))
1591        # Incompatibilities
1592        self.assertRaises(ValueError, re.compile, '', re.LOCALE)
1593        self.assertRaises(re.error, re.compile, '(?L)')
1594        self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
1595        self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
1596        self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
1597        self.assertRaises(re.error, re.compile, b'(?aL)')
1598
def test_scoped_flags(self):
    # (?i:...) / (?-i:...) switch case-insensitivity for the group only.
    self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
    self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
    self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
    self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
    self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
    self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))

    # Same for verbose mode: whitespace is ignored only where it is on.
    self.assertTrue(re.match(r'(?x: a) b', 'a b'))
    self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
    self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
    self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))

    # ASCII / Unicode matching can be selected per group.
    self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0'))
    self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0'))
    self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII))

    # Invalid flag combinations.
    self.checkPatternError(
        r'(?a)(?-a:\w)',
        "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8)
    self.checkPatternError(
        r'(?i-i:a)',
        'bad inline flags: flag turned on and off', 5)
    self.checkPatternError(
        r'(?au:a)',
        "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
    self.checkPatternError(
        br'(?aL:a)',
        "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)

    # Truncated or malformed flag groups: (pattern, message, position).
    malformed = (
        (r'(?-', 'missing flag', 3),
        (r'(?-+', 'missing flag', 3),
        (r'(?-z', 'unknown flag', 3),
        (r'(?-i', 'missing :', 4),
        (r'(?-i)', 'missing :', 4),
        (r'(?-i+', 'missing :', 4),
        (r'(?-iz', 'unknown flag', 4),
        (r'(?i:', 'missing ), unterminated subpattern', 0),
        (r'(?i', 'missing -, : or )', 3),
        (r'(?i+', 'missing -, : or )', 3),
        (r'(?iz', 'unknown flag', 3),
    )
    for pattern, errmsg, pos in malformed:
        self.checkPatternError(pattern, errmsg, pos)
1636
1637    def test_bug_6509(self):
1638        # Replacement strings of both types must parse properly.
1639        # all strings
1640        pat = re.compile(r'a(\w)')
1641        self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1642        pat = re.compile('a(.)')
1643        self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1644        pat = re.compile('..')
1645        self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1646
1647        # all bytes
1648        pat = re.compile(br'a(\w)')
1649        self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1650        pat = re.compile(b'a(.)')
1651        self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1652        pat = re.compile(b'..')
1653        self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1654
1655    def test_dealloc(self):
1656        # issue 3299: check for segfault in debug build
1657        import _sre
1658        # the overflow limit is different on wide and narrow builds and it
1659        # depends on the definition of SRE_CODE (see sre.h).
1660        # 2**128 should be big enough to overflow on both. For smaller values
1661        # a RuntimeError is raised instead of OverflowError.
1662        long_overflow = 2**128
1663        self.assertRaises(TypeError, re.finditer, "a", {})
1664        with self.assertRaises(OverflowError):
1665            _sre.compile("abc", 0, [long_overflow], 0, {}, ())
1666        with self.assertRaises(TypeError):
1667            _sre.compile({}, 0, [], 0, [], [])
1668
1669    def test_search_dot_unicode(self):
1670        self.assertTrue(re.search("123.*-", '123abc-'))
1671        self.assertTrue(re.search("123.*-", '123\xe9-'))
1672        self.assertTrue(re.search("123.*-", '123\u20ac-'))
1673        self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1674        self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
1675
1676    def test_compile(self):
1677        # Test return value when given string and pattern as parameter
1678        pattern = re.compile('random pattern')
1679        self.assertIsInstance(pattern, re.Pattern)
1680        same_pattern = re.compile(pattern)
1681        self.assertIsInstance(same_pattern, re.Pattern)
1682        self.assertIs(same_pattern, pattern)
1683        # Test behaviour when not given a string or pattern as parameter
1684        self.assertRaises(TypeError, re.compile, 0)
1685
1686    @bigmemtest(size=_2G, memuse=1)
1687    def test_large_search(self, size):
1688        # Issue #10182: indices were 32-bit-truncated.
1689        s = 'a' * size
1690        m = re.search('$', s)
1691        self.assertIsNotNone(m)
1692        self.assertEqual(m.start(), size)
1693        self.assertEqual(m.end(), size)
1694
1695    # The huge memuse is because of re.sub() using a list and a join()
1696    # to create the replacement result.
1697    @bigmemtest(size=_2G, memuse=16 + 2)
1698    def test_large_subn(self, size):
1699        # Issue #10182: indices were 32-bit-truncated.
1700        s = 'a' * size
1701        r, n = re.subn('', '', s)
1702        self.assertEqual(r, s)
1703        self.assertEqual(n, size + 1)
1704
1705    def test_bug_16688(self):
1706        # Issue 16688: Backreferences make case-insensitive regex fail on
1707        # non-ASCII strings.
1708        self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1709        self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
1710
1711    def test_repeat_minmax_overflow(self):
1712        # Issue #13169
1713        string = "x" * 100000
1714        self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1715        self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1716        self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1717        self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1718        self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1719        self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1720        # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1721        self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1722        self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1723        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1724        self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1725
1726    @cpython_only
1727    def test_repeat_minmax_overflow_maxrepeat(self):
1728        try:
1729            from _sre import MAXREPEAT
1730        except ImportError:
1731            self.skipTest('requires _sre.MAXREPEAT constant')
1732        string = "x" * 100000
1733        self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1734        self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1735                         (0, 100000))
1736        self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1737        self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1738        self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1739        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1740
def test_backref_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    # The offending name must be quoted in the error message.
    self.checkPatternError(
        '(?P=<foo>)', "bad character in group name '<foo>'", 4)
1745
def test_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    # The offending name must be quoted in the error message.
    self.checkPatternError(
        '(?P<?foo>)', "bad character in group name '?foo'", 4)
1750
1751    def test_issue17998(self):
1752        for reps in '*', '+', '?', '{1}':
1753            for mod in '', '?':
1754                pattern = '.' + reps + mod + 'yz'
1755                self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1756                                 ['xyz'], msg=pattern)
1757                pattern = pattern.encode()
1758                self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1759                                 [b'xyz'], msg=pattern)
1760
def test_match_repr(self):
    # repr() of a match shows its type, span and matched text; the
    # module prefix of the type name is optional.
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        pattern = r"<(%s\.)?%s object; span=\(1, 12\), match='abracadabra'>" % (
            type(m).__module__, type(m).__qualname__
        )
        self.assertRegex(repr(m), pattern)
    # Every bytes-like subject is shown as a bytes literal.
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(br'(.+)(.*?)\1', subject)
        pattern = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % (
            type(m).__module__, type(m).__qualname__
        )
        self.assertRegex(repr(m), pattern)

    # Each match reports its own span and text.
    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    pattern = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % (
        type(first).__module__, type(first).__qualname__
    )
    self.assertRegex(repr(first), pattern)
    pattern = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % (
        type(second).__module__, type(second).__qualname__
    )
    self.assertRegex(repr(second), pattern)
1786
1787    def test_zerowidth(self):
1788        # Issues 852532, 1647489, 3262, 25054.
1789        self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
1790        self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
1791        self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
1792        self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
1793
1794        self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
1795        self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
1796        self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')
1797
1798        self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
1799        self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
1800                         ['', 'a', '', '', 'bc', ''])
1801
1802        self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
1803                         [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
1804        self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
1805                         [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])
1806
1807    def test_bug_2537(self):
1808        # issue 2537: empty submatches
1809        for outer_op in ('{0,}', '*', '+', '{1,187}'):
1810            for inner_op in ('{0,}', '*', '?'):
1811                r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1812                m = r.match("xyyzy")
1813                self.assertEqual(m.group(0), "xyy")
1814                self.assertEqual(m.group(1), "")
1815                self.assertEqual(m.group(2), "y")
1816
    @cpython_only
    def test_debug_flag(self):
        # Compile a pattern with re.DEBUG while stdout is captured and
        # require the captured text to equal the dump below exactly.
        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        # Show the complete diff if the comparison fails.
        self.maxDiff = None
        # The literal is whitespace-sensitive: every line starts at
        # column 0 and is compared character by character.
        dump = '''\
SUBPATTERN 1 0 0
  LITERAL 46
BRANCH
  IN
    LITERAL 99
    LITERAL 104
OR
  LITERAL 112
  LITERAL 121
GROUPREF_EXISTS 1
  AT AT_END
ELSE
  LITERAL 58
  LITERAL 32

 0. INFO 8 0b1 2 5 (to 9)
      prefix_skip 0
      prefix [0x2e] ('.')
      overlap [0]
 9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17.   IN 6 (to 24)
19.     LITERAL 0x63 ('c')
21.     LITERAL 0x68 ('h')
23.     FAILURE
24:   JUMP 9 (to 34)
26: branch 7 (to 33)
27.   LITERAL 0x70 ('p')
29.   LITERAL 0x79 ('y')
31.   JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
'''
        self.assertEqual(out.getvalue(), dump)
        # The debug output must be produced again on a second compile of
        # the same pattern (bypassing the cache -- issue #20426).
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), dump)
1870
1871    def test_keyword_parameters(self):
1872        # Issue #20283: Accepting the string keyword parameter.
1873        pat = re.compile(r'(ab)')
1874        self.assertEqual(
1875            pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1876        self.assertEqual(
1877            pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1878        self.assertEqual(
1879            pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1880        self.assertEqual(
1881            pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1882        self.assertEqual(
1883            pat.split(string='abracadabra', maxsplit=1),
1884            ['', 'ab', 'racadabra'])
1885        self.assertEqual(
1886            pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1887            (7, 9))
1888
1889    def test_bug_20998(self):
1890        # Issue #20998: Fullmatch of repeated single character pattern
1891        # with ignore case.
1892        self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1893
1894    def test_locale_caching(self):
1895        # Issue #22410
1896        oldlocale = locale.setlocale(locale.LC_CTYPE)
1897        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1898        for loc in 'en_US.iso88591', 'en_US.utf8':
1899            try:
1900                locale.setlocale(locale.LC_CTYPE, loc)
1901            except locale.Error:
1902                # Unsupported locale on this system
1903                self.skipTest('test needs %s locale' % loc)
1904
1905        re.purge()
1906        self.check_en_US_iso88591()
1907        self.check_en_US_utf8()
1908        re.purge()
1909        self.check_en_US_utf8()
1910        self.check_en_US_iso88591()
1911
1912    def check_en_US_iso88591(self):
1913        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1914        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1915        self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1916        self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1917        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1918        self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1919        self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1920
1921    def check_en_US_utf8(self):
1922        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1923        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1924        self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1925        self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1926        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1927        self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1928        self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1929
1930    def test_locale_compiled(self):
1931        oldlocale = locale.setlocale(locale.LC_CTYPE)
1932        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1933        for loc in 'en_US.iso88591', 'en_US.utf8':
1934            try:
1935                locale.setlocale(locale.LC_CTYPE, loc)
1936            except locale.Error:
1937                # Unsupported locale on this system
1938                self.skipTest('test needs %s locale' % loc)
1939
1940        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1941        p1 = re.compile(b'\xc5\xe5', re.L|re.I)
1942        p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
1943        p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
1944        p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
1945        for p in p1, p2, p3:
1946            self.assertTrue(p.match(b'\xc5\xe5'))
1947            self.assertTrue(p.match(b'\xe5\xe5'))
1948            self.assertTrue(p.match(b'\xc5\xc5'))
1949        self.assertIsNone(p4.match(b'\xe5\xc5'))
1950        self.assertIsNone(p4.match(b'\xe5\xe5'))
1951        self.assertIsNone(p4.match(b'\xc5\xc5'))
1952
1953        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1954        for p in p1, p2, p3:
1955            self.assertTrue(p.match(b'\xc5\xe5'))
1956            self.assertIsNone(p.match(b'\xe5\xe5'))
1957            self.assertIsNone(p.match(b'\xc5\xc5'))
1958        self.assertTrue(p4.match(b'\xe5\xc5'))
1959        self.assertIsNone(p4.match(b'\xe5\xe5'))
1960        self.assertIsNone(p4.match(b'\xc5\xc5'))
1961
1962    def test_error(self):
1963        with self.assertRaises(re.error) as cm:
1964            re.compile('(\u20ac))')
1965        err = cm.exception
1966        self.assertIsInstance(err.pattern, str)
1967        self.assertEqual(err.pattern, '(\u20ac))')
1968        self.assertEqual(err.pos, 3)
1969        self.assertEqual(err.lineno, 1)
1970        self.assertEqual(err.colno, 4)
1971        self.assertIn(err.msg, str(err))
1972        self.assertIn(' at position 3', str(err))
1973        self.assertNotIn(' at position 3', err.msg)
1974        # Bytes pattern
1975        with self.assertRaises(re.error) as cm:
1976            re.compile(b'(\xa4))')
1977        err = cm.exception
1978        self.assertIsInstance(err.pattern, bytes)
1979        self.assertEqual(err.pattern, b'(\xa4))')
1980        self.assertEqual(err.pos, 3)
1981        # Multiline pattern
1982        with self.assertRaises(re.error) as cm:
1983            re.compile("""
1984                (
1985                    abc
1986                )
1987                )
1988                (
1989                """, re.VERBOSE)
1990        err = cm.exception
1991        self.assertEqual(err.pos, 77)
1992        self.assertEqual(err.lineno, 5)
1993        self.assertEqual(err.colno, 17)
1994        self.assertIn(err.msg, str(err))
1995        self.assertIn(' at position 77', str(err))
1996        self.assertIn('(line 5, column 17)', str(err))
1997
1998    def test_misc_errors(self):
1999        self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
2000        self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
2001        self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
2002        self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
2003        self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
2004        self.checkPatternError(r'(?iz)', 'unknown flag', 3)
2005        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
2006        self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
2007        self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
2008        self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
2009        self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
2010
2011    def test_enum(self):
2012        # Issue #28082: Check that str(flag) returns a human readable string
2013        # instead of an integer
2014        self.assertIn('ASCII', str(re.A))
2015        self.assertIn('DOTALL', str(re.S))
2016
2017    def test_pattern_compare(self):
2018        pattern1 = re.compile('abc', re.IGNORECASE)
2019
2020        # equal to itself
2021        self.assertEqual(pattern1, pattern1)
2022        self.assertFalse(pattern1 != pattern1)
2023
2024        # equal
2025        re.purge()
2026        pattern2 = re.compile('abc', re.IGNORECASE)
2027        self.assertEqual(hash(pattern2), hash(pattern1))
2028        self.assertEqual(pattern2, pattern1)
2029
2030        # not equal: different pattern
2031        re.purge()
2032        pattern3 = re.compile('XYZ', re.IGNORECASE)
2033        # Don't test hash(pattern3) != hash(pattern1) because there is no
2034        # warranty that hash values are different
2035        self.assertNotEqual(pattern3, pattern1)
2036
2037        # not equal: different flag (flags=0)
2038        re.purge()
2039        pattern4 = re.compile('abc')
2040        self.assertNotEqual(pattern4, pattern1)
2041
2042        # only == and != comparison operators are supported
2043        with self.assertRaises(TypeError):
2044            pattern1 < pattern2
2045
2046    def test_pattern_compare_bytes(self):
2047        pattern1 = re.compile(b'abc')
2048
2049        # equal: test bytes patterns
2050        re.purge()
2051        pattern2 = re.compile(b'abc')
2052        self.assertEqual(hash(pattern2), hash(pattern1))
2053        self.assertEqual(pattern2, pattern1)
2054
2055        # not equal: pattern of a different types (str vs bytes),
2056        # comparison must not raise a BytesWarning
2057        re.purge()
2058        pattern3 = re.compile('abc')
2059        with warnings.catch_warnings():
2060            warnings.simplefilter('error', BytesWarning)
2061            self.assertNotEqual(pattern3, pattern1)
2062
2063    def test_bug_29444(self):
2064        s = bytearray(b'abcdefgh')
2065        m = re.search(b'[a-h]+', s)
2066        m2 = re.search(b'[e-h]+', s)
2067        self.assertEqual(m.group(), b'abcdefgh')
2068        self.assertEqual(m2.group(), b'efgh')
2069        s[:] = b'xyz'
2070        self.assertEqual(m.group(), b'xyz')
2071        self.assertEqual(m2.group(), b'')
2072
2073    def test_bug_34294(self):
2074        # Issue 34294: wrong capturing groups
2075
2076        # exists since Python 2
2077        s = "a\tx"
2078        p = r"\b(?=(\t)|(x))x"
2079        self.assertEqual(re.search(p, s).groups(), (None, 'x'))
2080
2081        # introduced in Python 3.7.0
2082        s = "ab"
2083        p = r"(?=(.)(.)?)"
2084        self.assertEqual(re.findall(p, s),
2085                         [('a', 'b'), ('b', '')])
2086        self.assertEqual([m.groups() for m in re.finditer(p, s)],
2087                         [('a', 'b'), ('b', None)])
2088
2089        # test-cases provided by issue34294, introduced in Python 3.7.0
2090        p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
2091        s = "<test><foo2/></test>"
2092        self.assertEqual(re.findall(p, s),
2093                         [('test', '<foo2/>'), ('foo2', '')])
2094        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2095                         [{'tag': 'test', 'text': '<foo2/>'},
2096                          {'tag': 'foo2', 'text': None}])
2097        s = "<test>Hello</test><foo/>"
2098        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2099                         [{'tag': 'test', 'text': 'Hello'},
2100                          {'tag': 'foo', 'text': None}])
2101        s = "<test>Hello</test><foo/><foo/>"
2102        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2103                         [{'tag': 'test', 'text': 'Hello'},
2104                          {'tag': 'foo', 'text': None},
2105                          {'tag': 'foo', 'text': None}])
2106
2107
class PatternReprTests(unittest.TestCase):
    # repr() of compiled pattern objects and of re flag values.

    def check(self, pattern, expected):
        # repr() of *pattern* compiled without explicit flags.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # repr() of *pattern* compiled with *flags*.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.U is expected to be left out of the repr of a str pattern.
        expected = "re.compile('random pattern')"
        self.check_flags('random pattern', re.U, expected)
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a flag name are expected to be shown in hex.
        expected = "re.compile('random pattern', 0x123000)"
        self.check_flags('random pattern', 0x123000, expected)
        expected = "re.compile('random pattern', re.IGNORECASE|0x123000)"
        self.check_flags('random pattern', 0x123000|re.I, expected)

    def test_bytes(self):
        expected = "re.compile(b'bytes pattern')"
        self.check(b'bytes pattern', expected)
        expected = "re.compile(b'bytes pattern', re.ASCII)"
        self.check_flags(b'bytes pattern', re.A, expected)

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern must stay short while keeping
        # the start of the pattern and, when present, the trailing flags.
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], head)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")

    def test_flags_repr(self):
        # (flag value, expected repr) pairs, including inverted values and
        # a bit (1<<20) that has no flag name.
        cases = (
            (re.I, "re.IGNORECASE"),
            (re.I|re.S|re.X,
             "re.IGNORECASE|re.DOTALL|re.VERBOSE"),
            (re.I|re.S|re.X|(1<<20),
             "re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000"),
            (~re.I, "~re.IGNORECASE"),
            (~(re.I|re.S|re.X),
             "~(re.IGNORECASE|re.DOTALL|re.VERBOSE)"),
            (~(re.I|re.S|re.X|(1<<20)),
             "~(re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000)"),
        )
        for flag, expected in cases:
            self.assertEqual(repr(flag), expected)
2184
2185
class ImplementationTest(unittest.TestCase):
    """
    Checks that reach into private helpers of the re implementation.
    """

    def test_overlap_table(self):
        # (prefix, expected overlap table) pairs for the private
        # sre_compile._generate_overlap_table helper.
        overlap = sre_compile._generate_overlap_table
        expectations = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, table in expectations:
            self.assertEqual(overlap(prefix), table)
2199
2200
2201class ExternalTests(unittest.TestCase):
2202
2203    def test_re_benchmarks(self):
2204        're_tests benchmarks'
2205        from test.re_tests import benchmarks
2206        for pattern, s in benchmarks:
2207            with self.subTest(pattern=pattern, string=s):
2208                p = re.compile(pattern)
2209                self.assertTrue(p.search(s))
2210                self.assertTrue(p.match(s))
2211                self.assertTrue(p.fullmatch(s))
2212                s2 = ' '*10000 + s + ' '*10000
2213                self.assertTrue(p.search(s2))
2214                self.assertTrue(p.match(s2, 10000))
2215                self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
2216                self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))
2217
2218    def test_re_tests(self):
2219        're_tests test suite'
2220        from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
2221        for t in tests:
2222            pattern = s = outcome = repl = expected = None
2223            if len(t) == 5:
2224                pattern, s, outcome, repl, expected = t
2225            elif len(t) == 3:
2226                pattern, s, outcome = t
2227            else:
2228                raise ValueError('Test tuples should have 3 or 5 fields', t)
2229
2230            with self.subTest(pattern=pattern, string=s):
2231                if outcome == SYNTAX_ERROR:  # Expected a syntax error
2232                    with self.assertRaises(re.error):
2233                        re.compile(pattern)
2234                    continue
2235
2236                obj = re.compile(pattern)
2237                result = obj.search(s)
2238                if outcome == FAIL:
2239                    self.assertIsNone(result, 'Succeeded incorrectly')
2240                    continue
2241
2242                with self.subTest():
2243                    self.assertTrue(result, 'Failed incorrectly')
2244                    # Matched, as expected, so now we compute the
2245                    # result string and compare it to our expected result.
2246                    start, end = result.span(0)
2247                    vardict = {'found': result.group(0),
2248                               'groups': result.group(),
2249                               'flags': result.re.flags}
2250                    for i in range(1, 100):
2251                        try:
2252                            gi = result.group(i)
2253                            # Special hack because else the string concat fails:
2254                            if gi is None:
2255                                gi = "None"
2256                        except IndexError:
2257                            gi = "Error"
2258                        vardict['g%d' % i] = gi
2259                    for i in result.re.groupindex.keys():
2260                        try:
2261                            gi = result.group(i)
2262                            if gi is None:
2263                                gi = "None"
2264                        except IndexError:
2265                            gi = "Error"
2266                        vardict[i] = gi
2267                    self.assertEqual(eval(repl, vardict), expected,
2268                                     'grouping error')
2269
2270                # Try the match with both pattern and string converted to
2271                # bytes, and check that it still succeeds.
2272                try:
2273                    bpat = bytes(pattern, "ascii")
2274                    bs = bytes(s, "ascii")
2275                except UnicodeEncodeError:
2276                    # skip non-ascii tests
2277                    pass
2278                else:
2279                    with self.subTest('bytes pattern match'):
2280                        obj = re.compile(bpat)
2281                        self.assertTrue(obj.search(bs))
2282
2283                    # Try the match with LOCALE enabled, and check that it
2284                    # still succeeds.
2285                    with self.subTest('locale-sensitive match'):
2286                        obj = re.compile(bpat, re.LOCALE)
2287                        result = obj.search(bs)
2288                        if result is None:
2289                            print('=== Fails on locale-sensitive match', t)
2290
2291                # Try the match with the search area limited to the extent
2292                # of the match and see if it still succeeds.  \B will
2293                # break (because it won't match at the end or start of a
2294                # string), so we'll ignore patterns that feature it.
2295                if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
2296                            and result is not None):
2297                    with self.subTest('range-limited match'):
2298                        obj = re.compile(pattern)
2299                        self.assertTrue(obj.search(s, start, end + 1))
2300
2301                # Try the match with IGNORECASE enabled, and check that it
2302                # still succeeds.
2303                with self.subTest('case-insensitive match'):
2304                    obj = re.compile(pattern, re.IGNORECASE)
2305                    self.assertTrue(obj.search(s))
2306
2307                # Try the match with UNICODE locale enabled, and check
2308                # that it still succeeds.
2309                with self.subTest('unicode-sensitive match'):
2310                    obj = re.compile(pattern, re.UNICODE)
2311                    self.assertTrue(obj.search(s))
2312
2313
# Run this module's tests when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
2316