• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from test.support import (gc_collect, bigmemtest, _2G,
2                          cpython_only, captured_stdout,
3                          check_disallow_instantiation)
4import locale
5import re
6import sre_compile
7import string
8import unittest
9import warnings
10from re import Scanner
11from weakref import proxy
12
13# Misc tests from Tim Peters' re.doc
14
15# WARNING: Don't change details in these tests if you don't know
16# what you're doing. Some of these tests were carefully modeled to
17# cover most of the code.
18
class S(str):
    """``str`` subclass whose item access returns ``S`` instances.

    Lets the tests feed a string subclass to the re functions.
    """

    def __getitem__(self, index):
        # Re-wrap whatever plain str indexing/slicing produces.
        plain = str.__getitem__(self, index)
        return S(plain)
22
class B(bytes):
    """``bytes`` subclass whose item access returns ``B`` instances.

    Lets the tests feed a bytes subclass to the re functions.
    """

    def __getitem__(self, index):
        # Re-wrap whatever plain bytes indexing/slicing produces.
        plain = bytes.__getitem__(self, index)
        return B(plain)
26
27class ReTests(unittest.TestCase):
28
29    def assertTypedEqual(self, actual, expect, msg=None):
30        self.assertEqual(actual, expect, msg)
31        def recurse(actual, expect):
32            if isinstance(expect, (tuple, list)):
33                for x, y in zip(actual, expect):
34                    recurse(x, y)
35            else:
36                self.assertIs(type(actual), type(expect), msg)
37        recurse(actual, expect)
38
39    def checkPatternError(self, pattern, errmsg, pos=None):
40        with self.assertRaises(re.error) as cm:
41            re.compile(pattern)
42        with self.subTest(pattern=pattern):
43            err = cm.exception
44            self.assertEqual(err.msg, errmsg)
45            if pos is not None:
46                self.assertEqual(err.pos, pos)
47
48    def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
49        with self.assertRaises(re.error) as cm:
50            re.sub(pattern, repl, string)
51        with self.subTest(pattern=pattern, repl=repl):
52            err = cm.exception
53            self.assertEqual(err.msg, errmsg)
54            if pos is not None:
55                self.assertEqual(err.pos, pos)
56
57    def test_keep_buffer(self):
58        # See bug 14212
59        b = bytearray(b'x')
60        it = re.finditer(b'a', b)
61        with self.assertRaises(BufferError):
62            b.extend(b'x'*400)
63        list(it)
64        del it
65        gc_collect()
66        b.extend(b'x'*400)
67
68    def test_weakref(self):
69        s = 'QabbbcR'
70        x = re.compile('ab+c')
71        y = proxy(x)
72        self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
73
74    def test_search_star_plus(self):
75        self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
76        self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
77        self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
78        self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
79        self.assertIsNone(re.search('x', 'aaa'))
80        self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
81        self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
82        self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
83        self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
84        self.assertIsNone(re.match('a+', 'xxx'))
85
86    def bump_num(self, matchobj):
87        int_value = int(matchobj.group(0))
88        return str(int_value + 1)
89
90    def test_basic_re_sub(self):
91        self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
92        self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
93        self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
94        self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
95        self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
96        self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
97        for y in ("\xe0", "\u0430", "\U0001d49c"):
98            self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')
99
100        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
101        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
102                         '9.3 -3 24x100y')
103        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
104                         '9.3 -3 23x99y')
105        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
106                         '9.3 -3 23x99y')
107
108        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
109        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
110
111        s = r"\1\1"
112        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
113        self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
114        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
115
116        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
117        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
118        self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
119        self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
120
121        self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
122        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
123        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
124                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
125        for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
126            with self.subTest(c):
127                with self.assertRaises(re.error):
128                    self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
129
130        self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
131
132    def test_bug_449964(self):
133        # fails for group followed by other escape
134        self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
135                         'xx\bxx\b')
136
137    def test_bug_449000(self):
138        # Test for sub() on escaped characters
139        self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
140                         'abc\ndef\n')
141        self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
142                         'abc\ndef\n')
143        self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
144                         'abc\ndef\n')
145        self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
146                         'abc\ndef\n')
147
148    def test_bug_1661(self):
149        # Verify that flags do not get silently ignored with compiled patterns
150        pattern = re.compile('.')
151        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
152        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
153        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
154        self.assertRaises(ValueError, re.compile, pattern, re.I)
155
156    def test_bug_3629(self):
157        # A regex that triggered a bug in the sre-code validator
158        re.compile("(?P<quote>)(?(quote))")
159
160    def test_sub_template_numeric_escape(self):
161        # bug 776311 and friends
162        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
163        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
164        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
165        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
166        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
167        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
168        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
169        self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
170
171        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
172        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
173
174        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
175        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
176        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
177        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
178        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
179
180        self.checkTemplateError('x', r'\400', 'x',
181                                r'octal escape value \400 outside of '
182                                r'range 0-0o377', 0)
183        self.checkTemplateError('x', r'\777', 'x',
184                                r'octal escape value \777 outside of '
185                                r'range 0-0o377', 0)
186
187        self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
188        self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
189        self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
190        self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
191        self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
192        self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
193        self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
194        self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
195        self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
196        self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
197        self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
198        self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
199        self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)
200
201        # in python2.3 (etc), these loop endlessly in sre_parser.py
202        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
203        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
204                         'xz8')
205        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
206                         'xza')
207
208    def test_qualified_re_sub(self):
209        self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
210        self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
211        self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
212
213    def test_bug_114660(self):
214        self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
215                         'hello there')
216
217    def test_symbolic_groups(self):
218        re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
219        re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
220        re.compile(r'(?P<a1>x)\1(?(1)y)')
221        self.checkPatternError(r'(?P<a>)(?P<a>)',
222                               "redefinition of group name 'a' as group 2; "
223                               "was group 1")
224        self.checkPatternError(r'(?P<a>(?P=a))',
225                               "cannot refer to an open group", 10)
226        self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
227        self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
228        self.checkPatternError(r'(?P=', 'missing group name', 4)
229        self.checkPatternError(r'(?P=)', 'missing group name', 4)
230        self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
231        self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
232        self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
233        self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
234        self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
235        self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
236        self.checkPatternError(r'(?P<', 'missing group name', 4)
237        self.checkPatternError(r'(?P<>)', 'missing group name', 4)
238        self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
239        self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
240        self.checkPatternError(r'(?(', 'missing group name', 3)
241        self.checkPatternError(r'(?())', 'missing group name', 3)
242        self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
243        self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
244        self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
245        self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
246        # New valid/invalid identifiers in Python 3
247        re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
248        re.compile('(?P<��������������>x)(?P=��������������)(?(��������������)y)')
249        self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
250        # Support > 100 groups.
251        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
252        pat = '(?:%s)(?(200)z|t)' % pat
253        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
254
255    def test_symbolic_refs(self):
256        self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
257                                'missing >, unterminated name', 3)
258        self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
259                                'missing group name', 3)
260        self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
261        self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
262                                "bad character in group name 'a a'", 3)
263        self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
264                                'missing group name', 3)
265        self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
266                                "bad character in group name '1a1'", 3)
267        self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
268                                'invalid group reference 2', 3)
269        self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
270                                'invalid group reference 2', 1)
271        with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
272            re.sub('(?P<a>x)', r'\g<ab>', 'xx')
273        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
274        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
275        self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
276                                "bad character in group name '-1'", 3)
277        # New valid/invalid identifiers in Python 3
278        self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
279        self.assertEqual(re.sub('(?P<��������������>x)', r'\g<��������������>', 'xx'), 'xx')
280        self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
281                                "bad character in group name '©'", 3)
282        # Support > 100 groups.
283        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
284        self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
285
286    def test_re_subn(self):
287        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
288        self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
289        self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
290        self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
291        self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
292        self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
293
294    def test_re_split(self):
295        for string in ":a:b::c", S(":a:b::c"):
296            self.assertTypedEqual(re.split(":", string),
297                                  ['', 'a', 'b', '', 'c'])
298            self.assertTypedEqual(re.split(":+", string),
299                                  ['', 'a', 'b', 'c'])
300            self.assertTypedEqual(re.split("(:+)", string),
301                                  ['', ':', 'a', ':', 'b', '::', 'c'])
302        for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
303                       memoryview(b":a:b::c")):
304            self.assertTypedEqual(re.split(b":", string),
305                                  [b'', b'a', b'b', b'', b'c'])
306            self.assertTypedEqual(re.split(b":+", string),
307                                  [b'', b'a', b'b', b'c'])
308            self.assertTypedEqual(re.split(b"(:+)", string),
309                                  [b'', b':', b'a', b':', b'b', b'::', b'c'])
310        for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
311                        "\U0001d49c\U0001d49e\U0001d4b5"):
312            string = ":%s:%s::%s" % (a, b, c)
313            self.assertEqual(re.split(":", string), ['', a, b, '', c])
314            self.assertEqual(re.split(":+", string), ['', a, b, c])
315            self.assertEqual(re.split("(:+)", string),
316                             ['', ':', a, ':', b, '::', c])
317
318        self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
319        self.assertEqual(re.split("(:)+", ":a:b::c"),
320                         ['', ':', 'a', ':', 'b', ':', 'c'])
321        self.assertEqual(re.split("([b:]+)", ":a:b::c"),
322                         ['', ':', 'a', ':b::', 'c'])
323        self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
324                         ['', None, ':', 'a', None, ':', '', 'b', None, '',
325                          None, '::', 'c'])
326        self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
327                         ['', 'a', '', '', 'c'])
328
329        for sep, expected in [
330            (':*', ['', '', 'a', '', 'b', '', 'c', '']),
331            ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
332            ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']),
333            ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']),
334        ]:
335            with self.subTest(sep=sep):
336                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
337
338        for sep, expected in [
339            ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
340            (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
341            (r'(?=:)', ['', ':a', ':b', ':', ':c']),
342            (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
343        ]:
344            with self.subTest(sep=sep):
345                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
346
347    def test_qualified_re_split(self):
348        self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
349        self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
350        self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
351        self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
352                         ['', ':', 'a', ':', 'b::c'])
353        self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
354                         ['', ':', 'a', ':', 'b::c'])
355        self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
356                         ['', ':', '', '', 'a:b::c'])
357
358    def test_re_findall(self):
359        self.assertEqual(re.findall(":+", "abc"), [])
360        for string in "a:b::c:::d", S("a:b::c:::d"):
361            self.assertTypedEqual(re.findall(":+", string),
362                                  [":", "::", ":::"])
363            self.assertTypedEqual(re.findall("(:+)", string),
364                                  [":", "::", ":::"])
365            self.assertTypedEqual(re.findall("(:)(:*)", string),
366                                  [(":", ""), (":", ":"), (":", "::")])
367        for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
368                       memoryview(b"a:b::c:::d")):
369            self.assertTypedEqual(re.findall(b":+", string),
370                                  [b":", b"::", b":::"])
371            self.assertTypedEqual(re.findall(b"(:+)", string),
372                                  [b":", b"::", b":::"])
373            self.assertTypedEqual(re.findall(b"(:)(:*)", string),
374                                  [(b":", b""), (b":", b":"), (b":", b"::")])
375        for x in ("\xe0", "\u0430", "\U0001d49c"):
376            xx = x * 2
377            xxx = x * 3
378            string = "a%sb%sc%sd" % (x, xx, xxx)
379            self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
380            self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
381            self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
382                             [(x, ""), (x, x), (x, xx)])
383
384    def test_bug_117612(self):
385        self.assertEqual(re.findall(r"(a|(b))", "aba"),
386                         [("a", ""),("b", "b"),("a", "")])
387
388    def test_re_match(self):
389        for string in 'a', S('a'):
390            self.assertEqual(re.match('a', string).groups(), ())
391            self.assertEqual(re.match('(a)', string).groups(), ('a',))
392            self.assertEqual(re.match('(a)', string).group(0), 'a')
393            self.assertEqual(re.match('(a)', string).group(1), 'a')
394            self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
395        for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
396            self.assertEqual(re.match(b'a', string).groups(), ())
397            self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
398            self.assertEqual(re.match(b'(a)', string).group(0), b'a')
399            self.assertEqual(re.match(b'(a)', string).group(1), b'a')
400            self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
401        for a in ("\xe0", "\u0430", "\U0001d49c"):
402            self.assertEqual(re.match(a, a).groups(), ())
403            self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
404            self.assertEqual(re.match('(%s)' % a, a).group(0), a)
405            self.assertEqual(re.match('(%s)' % a, a).group(1), a)
406            self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))
407
408        pat = re.compile('((a)|(b))(c)?')
409        self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
410        self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
411        self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
412        self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
413        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
414
415        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
416        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
417        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
418                         (None, 'b', None))
419        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
420
421    def test_group(self):
422        class Index:
423            def __init__(self, value):
424                self.value = value
425            def __index__(self):
426                return self.value
427        # A single group
428        m = re.match('(a)(b)', 'ab')
429        self.assertEqual(m.group(), 'ab')
430        self.assertEqual(m.group(0), 'ab')
431        self.assertEqual(m.group(1), 'a')
432        self.assertEqual(m.group(Index(1)), 'a')
433        self.assertRaises(IndexError, m.group, -1)
434        self.assertRaises(IndexError, m.group, 3)
435        self.assertRaises(IndexError, m.group, 1<<1000)
436        self.assertRaises(IndexError, m.group, Index(1<<1000))
437        self.assertRaises(IndexError, m.group, 'x')
438        # Multiple groups
439        self.assertEqual(m.group(2, 1), ('b', 'a'))
440        self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
441
442    def test_match_getitem(self):
443        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
444
445        m = pat.match('a')
446        self.assertEqual(m['a1'], 'a')
447        self.assertEqual(m['b2'], None)
448        self.assertEqual(m['c3'], None)
449        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
450        self.assertEqual(m[0], 'a')
451        self.assertEqual(m[1], 'a')
452        self.assertEqual(m[2], None)
453        self.assertEqual(m[3], None)
454        with self.assertRaisesRegex(IndexError, 'no such group'):
455            m['X']
456        with self.assertRaisesRegex(IndexError, 'no such group'):
457            m[-1]
458        with self.assertRaisesRegex(IndexError, 'no such group'):
459            m[4]
460        with self.assertRaisesRegex(IndexError, 'no such group'):
461            m[0, 1]
462        with self.assertRaisesRegex(IndexError, 'no such group'):
463            m[(0,)]
464        with self.assertRaisesRegex(IndexError, 'no such group'):
465            m[(0, 1)]
466        with self.assertRaisesRegex(IndexError, 'no such group'):
467            'a1={a2}'.format_map(m)
468
469        m = pat.match('ac')
470        self.assertEqual(m['a1'], 'a')
471        self.assertEqual(m['b2'], None)
472        self.assertEqual(m['c3'], 'c')
473        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
474        self.assertEqual(m[0], 'ac')
475        self.assertEqual(m[1], 'a')
476        self.assertEqual(m[2], None)
477        self.assertEqual(m[3], 'c')
478
479        # Cannot assign.
480        with self.assertRaises(TypeError):
481            m[0] = 1
482
483        # No len().
484        self.assertRaises(TypeError, len, m)
485
486    def test_re_fullmatch(self):
487        # Issue 16203: Proposal: add re.fullmatch() method.
488        self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
489        for string in "ab", S("ab"):
490            self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
491        for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
492            self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
493        for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
494            r = r"%s|%s" % (a, a + b)
495            self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
496        self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
497        self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
498        self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
499        self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
500        self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
501        self.assertIsNone(re.fullmatch(r"a+", "ab"))
502        self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
503        self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
504        self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
505        self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
506        self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
507        self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
508
509        self.assertEqual(
510            re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
511        self.assertEqual(
512            re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
513        self.assertEqual(
514            re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
515
516    def test_re_groupref_exists(self):
517        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
518                         ('(', 'a'))
519        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
520                         (None, 'a'))
521        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
522        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
523        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
524                         ('a', 'b'))
525        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
526                         (None, 'd'))
527        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
528                         (None, 'd'))
529        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
530                         ('a', ''))
531
532        # Tests for bug #1177831: exercise groups other than the first group
533        p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
534        self.assertEqual(p.match('abc').groups(),
535                         ('a', 'b', 'c'))
536        self.assertEqual(p.match('ad').groups(),
537                         ('a', None, 'd'))
538        self.assertIsNone(p.match('abd'))
539        self.assertIsNone(p.match('ac'))
540
541        # Support > 100 groups.
542        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
543        pat = '(?:%s)(?(200)z)' % pat
544        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
545
546        self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
547        self.checkPatternError(r'()(?(1)a|b',
548                               'missing ), unterminated subpattern', 2)
549        self.checkPatternError(r'()(?(1)a|b|c)',
550                               'conditional backref with more than '
551                               'two branches', 10)
552
553    def test_re_groupref_overflow(self):
554        from sre_constants import MAXGROUPS
555        self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx',
556                                'invalid group reference %d' % MAXGROUPS, 3)
557        self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS,
558                               'invalid group reference %d' % MAXGROUPS, 10)
559
560    def test_re_groupref(self):
561        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
562                         ('|', 'a'))
563        self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
564                         (None, 'a'))
565        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
566        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
567        self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
568                         ('a', 'a'))
569        self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
570                         (None, None))
571
572        self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
573
574    def test_groupdict(self):
575        self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
576                                  'first second').groupdict(),
577                         {'first':'first', 'second':'second'})
578
579    def test_expand(self):
580        self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
581                                  "first second")
582                                  .expand(r"\2 \1 \g<second> \g<first>"),
583                         "second first second first")
584        self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
585                                  "first")
586                                  .expand(r"\2 \g<second>"),
587                         " ")
588
589    def test_repeat_minmax(self):
590        self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
591        self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
592        self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
593        self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))
594
595        self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
596        self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
597        self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
598        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
599        self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
600        self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
601        self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
602        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
603
604        self.assertIsNone(re.match(r"^x{1}$", "xxx"))
605        self.assertIsNone(re.match(r"^x{1}?$", "xxx"))
606        self.assertIsNone(re.match(r"^x{1,2}$", "xxx"))
607        self.assertIsNone(re.match(r"^x{1,2}?$", "xxx"))
608
609        self.assertTrue(re.match(r"^x{3}$", "xxx"))
610        self.assertTrue(re.match(r"^x{1,3}$", "xxx"))
611        self.assertTrue(re.match(r"^x{3,3}$", "xxx"))
612        self.assertTrue(re.match(r"^x{1,4}$", "xxx"))
613        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
614        self.assertTrue(re.match(r"^x{3}?$", "xxx"))
615        self.assertTrue(re.match(r"^x{1,3}?$", "xxx"))
616        self.assertTrue(re.match(r"^x{1,4}?$", "xxx"))
617        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
618
619        self.assertIsNone(re.match(r"^x{}$", "xxx"))
620        self.assertTrue(re.match(r"^x{}$", "x{}"))
621
622        self.checkPatternError(r'x{2,1}',
623                               'min repeat greater than max repeat', 2)
624
625    def test_getattr(self):
626        self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
627        self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
628        self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
629        self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
630        self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
631                         {'first': 1, 'other': 2})
632
633        self.assertEqual(re.match("(a)", "a").pos, 0)
634        self.assertEqual(re.match("(a)", "a").endpos, 1)
635        self.assertEqual(re.match("(a)", "a").string, "a")
636        self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
637        self.assertTrue(re.match("(a)", "a").re)
638
639        # Issue 14260. groupindex should be non-modifiable mapping.
640        p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
641        self.assertEqual(sorted(p.groupindex), ['first', 'other'])
642        self.assertEqual(p.groupindex['other'], 2)
643        with self.assertRaises(TypeError):
644            p.groupindex['other'] = 0
645        self.assertEqual(p.groupindex['other'], 2)
646
647    def test_special_escapes(self):
648        self.assertEqual(re.search(r"\b(b.)\b",
649                                   "abcd abc bcd bx").group(1), "bx")
650        self.assertEqual(re.search(r"\B(b.)\B",
651                                   "abc bcd bc abxd").group(1), "bx")
652        self.assertEqual(re.search(r"\b(b.)\b",
653                                   "abcd abc bcd bx", re.ASCII).group(1), "bx")
654        self.assertEqual(re.search(r"\B(b.)\B",
655                                   "abc bcd bc abxd", re.ASCII).group(1), "bx")
656        self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
657        self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
658        self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
659        self.assertEqual(re.search(br"\b(b.)\b",
660                                   b"abcd abc bcd bx").group(1), b"bx")
661        self.assertEqual(re.search(br"\B(b.)\B",
662                                   b"abc bcd bc abxd").group(1), b"bx")
663        self.assertEqual(re.search(br"\b(b.)\b",
664                                   b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
665        self.assertEqual(re.search(br"\B(b.)\B",
666                                   b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
667        self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
668        self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
669        self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
670        self.assertEqual(re.search(r"\d\D\w\W\s\S",
671                                   "1aa! a").group(0), "1aa! a")
672        self.assertEqual(re.search(br"\d\D\w\W\s\S",
673                                   b"1aa! a").group(0), b"1aa! a")
674        self.assertEqual(re.search(r"\d\D\w\W\s\S",
675                                   "1aa! a", re.ASCII).group(0), "1aa! a")
676        self.assertEqual(re.search(br"\d\D\w\W\s\S",
677                                   b"1aa! a", re.LOCALE).group(0), b"1aa! a")
678
679    def test_other_escapes(self):
680        self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
681        self.assertEqual(re.match(r"\(", '(').group(), '(')
682        self.assertIsNone(re.match(r"\(", ')'))
683        self.assertEqual(re.match(r"\\", '\\').group(), '\\')
684        self.assertEqual(re.match(r"[\]]", ']').group(), ']')
685        self.assertIsNone(re.match(r"[\]]", '['))
686        self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
687        self.assertIsNone(re.match(r"[a\-c]", 'b'))
688        self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
689        self.assertIsNone(re.match(r"[\^a]+", 'b'))
690        re.purge()  # for warnings
691        for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
692            with self.subTest(c):
693                self.assertRaises(re.error, re.compile, '\\%c' % c)
694        for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
695            with self.subTest(c):
696                self.assertRaises(re.error, re.compile, '[\\%c]' % c)
697
698    def test_named_unicode_escapes(self):
699        # test individual Unicode named escapes
700        self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))
701        self.assertTrue(re.match(r'\N{less-than sign}', '<'))
702        self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>'))
703        self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d'))
704        self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH '
705                                 r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}',
706                                 '\ufbf9'))
707        self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
708                                 '='))
709        self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
710                                   ';'))
711
712        # test errors in \N{name} handling - only valid names should pass
713        self.checkPatternError(r'\N', 'missing {', 2)
714        self.checkPatternError(r'[\N]', 'missing {', 3)
715        self.checkPatternError(r'\N{', 'missing character name', 3)
716        self.checkPatternError(r'[\N{', 'missing character name', 4)
717        self.checkPatternError(r'\N{}', 'missing character name', 3)
718        self.checkPatternError(r'[\N{}]', 'missing character name', 4)
719        self.checkPatternError(r'\NSNAKE}', 'missing {', 2)
720        self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3)
721        self.checkPatternError(r'\N{SNAKE',
722                               'missing }, unterminated name', 3)
723        self.checkPatternError(r'[\N{SNAKE]',
724                               'missing }, unterminated name', 4)
725        self.checkPatternError(r'[\N{SNAKE]}',
726                               "undefined character name 'SNAKE]'", 1)
727        self.checkPatternError(r'\N{SPAM}',
728                               "undefined character name 'SPAM'", 0)
729        self.checkPatternError(r'[\N{SPAM}]',
730                               "undefined character name 'SPAM'", 1)
731        self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0)
732        self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1)
733
734    def test_string_boundaries(self):
735        # See http://bugs.python.org/issue10713
736        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
737                         "abc")
738        # There's a word boundary at the start of a string.
739        self.assertTrue(re.match(r"\b", "abc"))
740        # A non-empty string includes a non-boundary zero-length match.
741        self.assertTrue(re.search(r"\B", "abc"))
742        # There is no non-boundary match at the start of a string.
743        self.assertFalse(re.match(r"\B", "abc"))
744        # However, an empty string contains no word boundaries, and also no
745        # non-boundaries.
746        self.assertIsNone(re.search(r"\B", ""))
747        # This one is questionable and different from the perlre behaviour,
748        # but describes current behavior.
749        self.assertIsNone(re.search(r"\b", ""))
750        # A single word-character string has two boundaries, but no
751        # non-boundary gaps.
752        self.assertEqual(len(re.findall(r"\b", "a")), 2)
753        self.assertEqual(len(re.findall(r"\B", "a")), 0)
754        # If there are no words, there are no boundaries
755        self.assertEqual(len(re.findall(r"\b", " ")), 0)
756        self.assertEqual(len(re.findall(r"\b", "   ")), 0)
757        # Can match around the whitespace.
758        self.assertEqual(len(re.findall(r"\B", " ")), 2)
759
760    def test_bigcharset(self):
761        self.assertEqual(re.match("([\u2222\u2223])",
762                                  "\u2222").group(1), "\u2222")
763        r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
764        self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
765
766    def test_big_codesize(self):
767        # Issue #1160
768        r = re.compile('|'.join(('%d'%x for x in range(10000))))
769        self.assertTrue(r.match('1000'))
770        self.assertTrue(r.match('9999'))
771
772    def test_anyall(self):
773        self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
774                         "a\nb")
775        self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
776                         "a\n\nb")
777
778    def test_lookahead(self):
779        self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
780        self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
781        self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
782        self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
783        self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
784        self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
785        self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
786
787        self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
788        self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
789        self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
790        self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
791
792        # Group reference.
793        self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
794        self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
795        # Conditional group reference.
796        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
797        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
798        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
799        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
800        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
801        # Group used before defined.
802        self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
803        self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
804        self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
805
806    def test_lookbehind(self):
807        self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
808        self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
809        self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
810        self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
811        # Group reference.
812        self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
813        self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
814        self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
815        self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
816        # Conditional group reference.
817        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
818        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
819        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
820        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
821        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
822        # Group used before defined.
823        self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
824        self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
825        self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
826        # Group defined in the same lookbehind pattern
827        self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
828        self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
829        self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
830        self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
831
832    def test_ignore_case(self):
833        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
834        self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
835        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
836        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
837        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
838        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
839        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
840        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
841        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
842        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
843
844        assert '\u212a'.lower() == 'k' # 'K'
845        self.assertTrue(re.match(r'K', '\u212a', re.I))
846        self.assertTrue(re.match(r'k', '\u212a', re.I))
847        self.assertTrue(re.match(r'\u212a', 'K', re.I))
848        self.assertTrue(re.match(r'\u212a', 'k', re.I))
849        assert '\u017f'.upper() == 'S' # 'ſ'
850        self.assertTrue(re.match(r'S', '\u017f', re.I))
851        self.assertTrue(re.match(r's', '\u017f', re.I))
852        self.assertTrue(re.match(r'\u017f', 'S', re.I))
853        self.assertTrue(re.match(r'\u017f', 's', re.I))
854        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
855        self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
856        self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
857
858    def test_ignore_case_set(self):
859        self.assertTrue(re.match(r'[19A]', 'A', re.I))
860        self.assertTrue(re.match(r'[19a]', 'a', re.I))
861        self.assertTrue(re.match(r'[19a]', 'A', re.I))
862        self.assertTrue(re.match(r'[19A]', 'a', re.I))
863        self.assertTrue(re.match(br'[19A]', b'A', re.I))
864        self.assertTrue(re.match(br'[19a]', b'a', re.I))
865        self.assertTrue(re.match(br'[19a]', b'A', re.I))
866        self.assertTrue(re.match(br'[19A]', b'a', re.I))
867        assert '\u212a'.lower() == 'k' # 'K'
868        self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
869        self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
870        self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
871        self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
872        assert '\u017f'.upper() == 'S' # 'ſ'
873        self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
874        self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
875        self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
876        self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
877        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
878        self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
879        self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
880
881    def test_ignore_case_range(self):
882        # Issues #3511, #17381.
883        self.assertTrue(re.match(r'[9-a]', '_', re.I))
884        self.assertIsNone(re.match(r'[9-A]', '_', re.I))
885        self.assertTrue(re.match(br'[9-a]', b'_', re.I))
886        self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
887        self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
888        self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
889        self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
890        self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
891        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
892        self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
893        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
894        self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
895        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
896        self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
897        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
898        self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
899
900        assert '\u212a'.lower() == 'k' # 'K'
901        self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
902        self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
903        self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
904        self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
905        assert '\u017f'.upper() == 'S' # 'ſ'
906        self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
907        self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
908        self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
909        self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
910        assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
911        self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
912        self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
913
914    def test_category(self):
915        self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
916
917    @cpython_only
918    def test_case_helpers(self):
919        import _sre
920        for i in range(128):
921            c = chr(i)
922            lo = ord(c.lower())
923            self.assertEqual(_sre.ascii_tolower(i), lo)
924            self.assertEqual(_sre.unicode_tolower(i), lo)
925            iscased = c in string.ascii_letters
926            self.assertEqual(_sre.ascii_iscased(i), iscased)
927            self.assertEqual(_sre.unicode_iscased(i), iscased)
928
929        for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
930            c = chr(i)
931            self.assertEqual(_sre.ascii_tolower(i), i)
932            if i != 0x0130:
933                self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
934            iscased = c != c.lower() or c != c.upper()
935            self.assertFalse(_sre.ascii_iscased(i))
936            self.assertEqual(_sre.unicode_iscased(i),
937                             c != c.lower() or c != c.upper())
938
939        self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
940        self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
941        self.assertFalse(_sre.ascii_iscased(0x0130))
942        self.assertTrue(_sre.unicode_iscased(0x0130))
943
944    def test_not_literal(self):
945        self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
946        self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
947
948    def test_possible_set_operations(self):
949        s = bytes(range(128)).decode()
950        with self.assertWarns(FutureWarning):
951            p = re.compile(r'[0-9--1]')
952        self.assertEqual(p.findall(s), list('-./0123456789'))
953        self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
954        with self.assertWarns(FutureWarning):
955            p = re.compile(r'[%--1]')
956        self.assertEqual(p.findall(s), list("%&'()*+,-1"))
957        with self.assertWarns(FutureWarning):
958            p = re.compile(r'[%--]')
959        self.assertEqual(p.findall(s), list("%&'()*+,-"))
960
961        with self.assertWarns(FutureWarning):
962            p = re.compile(r'[0-9&&1]')
963        self.assertEqual(p.findall(s), list('&0123456789'))
964        with self.assertWarns(FutureWarning):
965            p = re.compile(r'[\d&&1]')
966        self.assertEqual(p.findall(s), list('&0123456789'))
967        self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
968
969        with self.assertWarns(FutureWarning):
970            p = re.compile(r'[0-9||a]')
971        self.assertEqual(p.findall(s), list('0123456789a|'))
972        with self.assertWarns(FutureWarning):
973            p = re.compile(r'[\d||a]')
974        self.assertEqual(p.findall(s), list('0123456789a|'))
975        self.assertEqual(re.findall(r'[||1]', s), list('1|'))
976
977        with self.assertWarns(FutureWarning):
978            p = re.compile(r'[0-9~~1]')
979        self.assertEqual(p.findall(s), list('0123456789~'))
980        with self.assertWarns(FutureWarning):
981            p = re.compile(r'[\d~~1]')
982        self.assertEqual(p.findall(s), list('0123456789~'))
983        self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
984
985        with self.assertWarns(FutureWarning):
986            p = re.compile(r'[[0-9]|]')
987        self.assertEqual(p.findall(s), list('0123456789[]'))
988
989        with self.assertWarns(FutureWarning):
990            p = re.compile(r'[[:digit:]|]')
991        self.assertEqual(p.findall(s), list(':[]dgit'))
992
993    def test_search_coverage(self):
994        self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
995        self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
996
997    def assertMatch(self, pattern, text, match=None, span=None,
998                    matcher=re.fullmatch):
999        if match is None and span is None:
1000            # the pattern matches the whole text
1001            match = text
1002            span = (0, len(text))
1003        elif match is None or span is None:
1004            raise ValueError('If match is not None, span should be specified '
1005                             '(and vice versa).')
1006        m = matcher(pattern, text)
1007        self.assertTrue(m)
1008        self.assertEqual(m.group(), match)
1009        self.assertEqual(m.span(), span)
1010
1011    LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`'
1012
1013    def test_re_escape(self):
1014        p = ''.join(chr(i) for i in range(256))
1015        for c in p:
1016            self.assertMatch(re.escape(c), c)
1017            self.assertMatch('[' + re.escape(c) + ']', c)
1018            self.assertMatch('(?x)' + re.escape(c), c)
1019        self.assertMatch(re.escape(p), p)
1020        for c in '-.]{}':
1021            self.assertEqual(re.escape(c)[:1], '\\')
1022        literal_chars = self.LITERAL_CHARS
1023        self.assertEqual(re.escape(literal_chars), literal_chars)
1024
1025    def test_re_escape_bytes(self):
1026        p = bytes(range(256))
1027        for i in p:
1028            b = bytes([i])
1029            self.assertMatch(re.escape(b), b)
1030            self.assertMatch(b'[' + re.escape(b) + b']', b)
1031            self.assertMatch(b'(?x)' + re.escape(b), b)
1032        self.assertMatch(re.escape(p), p)
1033        for i in b'-.]{}':
1034            b = bytes([i])
1035            self.assertEqual(re.escape(b)[:1], b'\\')
1036        literal_chars = self.LITERAL_CHARS.encode('ascii')
1037        self.assertEqual(re.escape(literal_chars), literal_chars)
1038
1039    def test_re_escape_non_ascii(self):
1040        s = 'xxx\u2620\u2620\u2620xxx'
1041        s_escaped = re.escape(s)
1042        self.assertEqual(s_escaped, s)
1043        self.assertMatch(s_escaped, s)
1044        self.assertMatch('.%s+.' % re.escape('\u2620'), s,
1045                         'x\u2620\u2620\u2620x', (2, 7), re.search)
1046
1047    def test_re_escape_non_ascii_bytes(self):
1048        b = 'y\u2620y\u2620y'.encode('utf-8')
1049        b_escaped = re.escape(b)
1050        self.assertEqual(b_escaped, b)
1051        self.assertMatch(b_escaped, b)
1052        res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
1053        self.assertEqual(len(res), 2)
1054
1055    def test_pickling(self):
1056        import pickle
1057        oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
1058        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
1059            pickled = pickle.dumps(oldpat, proto)
1060            newpat = pickle.loads(pickled)
1061            self.assertEqual(newpat, oldpat)
1062        # current pickle expects the _compile() reconstructor in re module
1063        from re import _compile
1064
1065    def test_copying(self):
1066        import copy
1067        p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?')
1068        self.assertIs(copy.copy(p), p)
1069        self.assertIs(copy.deepcopy(p), p)
1070        m = p.match('12.34')
1071        self.assertIs(copy.copy(m), m)
1072        self.assertIs(copy.deepcopy(m), m)
1073
1074    def test_constants(self):
1075        self.assertEqual(re.I, re.IGNORECASE)
1076        self.assertEqual(re.L, re.LOCALE)
1077        self.assertEqual(re.M, re.MULTILINE)
1078        self.assertEqual(re.S, re.DOTALL)
1079        self.assertEqual(re.X, re.VERBOSE)
1080
1081    def test_flags(self):
1082        for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
1083            self.assertTrue(re.compile('^pattern$', flag))
1084        for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
1085            self.assertTrue(re.compile(b'^pattern$', flag))
1086
1087    def test_sre_character_literals(self):
1088        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
1089            if i < 256:
1090                self.assertTrue(re.match(r"\%03o" % i, chr(i)))
1091                self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
1092                self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
1093                self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
1094                self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
1095                self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
1096            if i < 0x10000:
1097                self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
1098                self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
1099                self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
1100            self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
1101            self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
1102            self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
1103        self.assertTrue(re.match(r"\0", "\000"))
1104        self.assertTrue(re.match(r"\08", "\0008"))
1105        self.assertTrue(re.match(r"\01", "\001"))
1106        self.assertTrue(re.match(r"\018", "\0018"))
1107        self.checkPatternError(r"\567",
1108                               r'octal escape value \567 outside of '
1109                               r'range 0-0o377', 0)
1110        self.checkPatternError(r"\911", 'invalid group reference 91', 1)
1111        self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
1112        self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
1113        self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
1114        self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
1115        self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
1116        self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
1117        self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
1118
1119    def test_sre_character_class_literals(self):
1120        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
1121            if i < 256:
1122                self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
1123                self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
1124                self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
1125                self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
1126                self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
1127                self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
1128                self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
1129                self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
1130            if i < 0x10000:
1131                self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
1132                self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
1133                self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
1134            self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
1135            self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
1136            self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
1137        self.checkPatternError(r"[\567]",
1138                               r'octal escape value \567 outside of '
1139                               r'range 0-0o377', 1)
1140        self.checkPatternError(r"[\911]", r'bad escape \9', 1)
1141        self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
1142        self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
1143        self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
1144        self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
1145        self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
1146
1147    def test_sre_byte_literals(self):
1148        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
1149            self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
1150            self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
1151            self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
1152            self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
1153            self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
1154            self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
1155        self.assertRaises(re.error, re.compile, br"\u1234")
1156        self.assertRaises(re.error, re.compile, br"\U00012345")
1157        self.assertTrue(re.match(br"\0", b"\000"))
1158        self.assertTrue(re.match(br"\08", b"\0008"))
1159        self.assertTrue(re.match(br"\01", b"\001"))
1160        self.assertTrue(re.match(br"\018", b"\0018"))
1161        self.checkPatternError(br"\567",
1162                               r'octal escape value \567 outside of '
1163                               r'range 0-0o377', 0)
1164        self.checkPatternError(br"\911", 'invalid group reference 91', 1)
1165        self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
1166        self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
1167
1168    def test_sre_byte_class_literals(self):
1169        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
1170            self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
1171            self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
1172            self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
1173            self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
1174            self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
1175            self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
1176            self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
1177            self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
1178        self.assertRaises(re.error, re.compile, br"[\u1234]")
1179        self.assertRaises(re.error, re.compile, br"[\U00012345]")
1180        self.checkPatternError(br"[\567]",
1181                               r'octal escape value \567 outside of '
1182                               r'range 0-0o377', 1)
1183        self.checkPatternError(br"[\911]", r'bad escape \9', 1)
1184        self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1185
1186    def test_character_set_errors(self):
1187        self.checkPatternError(r'[', 'unterminated character set', 0)
1188        self.checkPatternError(r'[^', 'unterminated character set', 0)
1189        self.checkPatternError(r'[a', 'unterminated character set', 0)
1190        # bug 545855 -- This pattern failed to cause a compile error as it
1191        # should, instead provoking a TypeError.
1192        self.checkPatternError(r"[a-", 'unterminated character set', 0)
1193        self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
1194        self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
1195        self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
1196
1197    def test_bug_113254(self):
1198        self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1199        self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1200        self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1201
1202    def test_bug_527371(self):
1203        # bug described in patches 527371/672491
1204        self.assertIsNone(re.match(r'(a)?a','a').lastindex)
1205        self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1206        self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1207        self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
1208        self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
1209
1210    def test_bug_418626(self):
1211        # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1212        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1213        # pattern '*?' on a long string.
1214        self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1215        self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1216                         20003)
1217        self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
1218        # non-simple '*?' still used to hit the recursion limit, before the
1219        # non-recursive scheme was implemented.
1220        self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
1221
1222    def test_bug_612074(self):
1223        pat="["+re.escape("\u2039")+"]"
1224        self.assertEqual(re.compile(pat) and 1, 1)
1225
1226    def test_stack_overflow(self):
1227        # nasty cases that used to overflow the straightforward recursive
1228        # implementation of repeated groups.
1229        self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1230        self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1231        self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
1232
1233    def test_nothing_to_repeat(self):
1234        for reps in '*', '+', '?', '{1,2}':
1235            for mod in '', '?':
1236                self.checkPatternError('%s%s' % (reps, mod),
1237                                       'nothing to repeat', 0)
1238                self.checkPatternError('(?:%s%s)' % (reps, mod),
1239                                       'nothing to repeat', 3)
1240
1241    def test_multiple_repeat(self):
1242        for outer_reps in '*', '+', '{1,2}':
1243            for outer_mod in '', '?':
1244                outer_op = outer_reps + outer_mod
1245                for inner_reps in '*', '+', '?', '{1,2}':
1246                    for inner_mod in '', '?':
1247                        inner_op = inner_reps + inner_mod
1248                        self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
1249                                'multiple repeat', 1 + len(inner_op))
1250
1251    def test_unlimited_zero_width_repeat(self):
1252        # Issue #9669
1253        self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1254        self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1255        self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1256        self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1257        self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1258        self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1259
1260    def test_scanner(self):
1261        def s_ident(scanner, token): return token
1262        def s_operator(scanner, token): return "op%s" % token
1263        def s_float(scanner, token): return float(token)
1264        def s_int(scanner, token): return int(token)
1265
1266        scanner = Scanner([
1267            (r"[a-zA-Z_]\w*", s_ident),
1268            (r"\d+\.\d*", s_float),
1269            (r"\d+", s_int),
1270            (r"=|\+|-|\*|/", s_operator),
1271            (r"\s+", None),
1272            ])
1273
1274        self.assertTrue(scanner.scanner.scanner("").pattern)
1275
1276        self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1277                         (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1278                           'op+', 'bar'], ''))
1279
1280    def test_bug_448951(self):
1281        # bug 448951 (similar to 429357, but with single char match)
1282        # (Also test greedy matches.)
1283        for op in '','?','*':
1284            self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1285                             (None, None))
1286            self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1287                             ('a:', 'a'))
1288
1289    def test_bug_725106(self):
1290        # capturing groups in alternatives in repeats
1291        self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1292                         ('b', 'a'))
1293        self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1294                         ('c', 'b'))
1295        self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1296                         ('b', None))
1297        self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1298                         ('b', None))
1299        self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1300                         ('b', 'a'))
1301        self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1302                         ('c', 'b'))
1303        self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1304                         ('b', None))
1305        self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1306                         ('b', None))
1307
1308    def test_bug_725149(self):
1309        # mark_stack_base restoring before restoring marks
1310        self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1311                         ('a', None))
1312        self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1313                         ('a', None, None))
1314
1315    def test_bug_764548(self):
1316        # bug 764548, re.compile() barfs on str/unicode subclasses
1317        class my_unicode(str): pass
1318        pat = re.compile(my_unicode("abc"))
1319        self.assertIsNone(pat.match("xyz"))
1320
1321    def test_finditer(self):
1322        iter = re.finditer(r":+", "a:b::c:::d")
1323        self.assertEqual([item.group(0) for item in iter],
1324                         [":", "::", ":::"])
1325
1326        pat = re.compile(r":+")
1327        iter = pat.finditer("a:b::c:::d", 1, 10)
1328        self.assertEqual([item.group(0) for item in iter],
1329                         [":", "::", ":::"])
1330
1331        pat = re.compile(r":+")
1332        iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1333        self.assertEqual([item.group(0) for item in iter],
1334                         [":", "::", ":::"])
1335
1336        pat = re.compile(r":+")
1337        iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1338        self.assertEqual([item.group(0) for item in iter],
1339                         [":", "::", ":::"])
1340
1341        pat = re.compile(r":+")
1342        iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1343        self.assertEqual([item.group(0) for item in iter],
1344                         ["::", "::"])
1345
1346    def test_bug_926075(self):
1347        self.assertIsNot(re.compile('bug_926075'),
1348                         re.compile(b'bug_926075'))
1349
1350    def test_bug_931848(self):
1351        pattern = "[\u002E\u3002\uFF0E\uFF61]"
1352        self.assertEqual(re.compile(pattern).split("a.b.c"),
1353                         ['a','b','c'])
1354
1355    def test_bug_581080(self):
1356        iter = re.finditer(r"\s", "a b")
1357        self.assertEqual(next(iter).span(), (1,2))
1358        self.assertRaises(StopIteration, next, iter)
1359
1360        scanner = re.compile(r"\s").scanner("a b")
1361        self.assertEqual(scanner.search().span(), (1, 2))
1362        self.assertIsNone(scanner.search())
1363
1364    def test_bug_817234(self):
1365        iter = re.finditer(r".*", "asdf")
1366        self.assertEqual(next(iter).span(), (0, 4))
1367        self.assertEqual(next(iter).span(), (4, 4))
1368        self.assertRaises(StopIteration, next, iter)
1369
1370    def test_bug_6561(self):
1371        # '\d' should match characters in Unicode category 'Nd'
1372        # (Number, Decimal Digit), but not those in 'Nl' (Number,
1373        # Letter) or 'No' (Number, Other).
1374        decimal_digits = [
1375            '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1376            '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1377            '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1378            ]
1379        for x in decimal_digits:
1380            self.assertEqual(re.match(r'^\d$', x).group(0), x)
1381
1382        not_decimal_digits = [
1383            '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1384            '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1385            '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1386            '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1387            ]
1388        for x in not_decimal_digits:
1389            self.assertIsNone(re.match(r'^\d$', x))
1390
1391    def test_empty_array(self):
1392        # SF buf 1647541
1393        import array
1394        for typecode in 'bBuhHiIlLfd':
1395            a = array.array(typecode)
1396            self.assertIsNone(re.compile(b"bla").match(a))
1397            self.assertEqual(re.compile(b"").match(a).groups(), ())
1398
1399    def test_inline_flags(self):
1400        # Bug #1700
1401        upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1402        lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
1403
1404        p = re.compile('.' + upper_char, re.I | re.S)
1405        q = p.match('\n' + lower_char)
1406        self.assertTrue(q)
1407
1408        p = re.compile('.' + lower_char, re.I | re.S)
1409        q = p.match('\n' + upper_char)
1410        self.assertTrue(q)
1411
1412        p = re.compile('(?i).' + upper_char, re.S)
1413        q = p.match('\n' + lower_char)
1414        self.assertTrue(q)
1415
1416        p = re.compile('(?i).' + lower_char, re.S)
1417        q = p.match('\n' + upper_char)
1418        self.assertTrue(q)
1419
1420        p = re.compile('(?is).' + upper_char)
1421        q = p.match('\n' + lower_char)
1422        self.assertTrue(q)
1423
1424        p = re.compile('(?is).' + lower_char)
1425        q = p.match('\n' + upper_char)
1426        self.assertTrue(q)
1427
1428        p = re.compile('(?s)(?i).' + upper_char)
1429        q = p.match('\n' + lower_char)
1430        self.assertTrue(q)
1431
1432        p = re.compile('(?s)(?i).' + lower_char)
1433        q = p.match('\n' + upper_char)
1434        self.assertTrue(q)
1435
1436        self.assertTrue(re.match('(?ix) ' + upper_char, lower_char))
1437        self.assertTrue(re.match('(?ix) ' + lower_char, upper_char))
1438        self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X))
1439        self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char))
1440        self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X))
1441
1442        p = upper_char + '(?i)'
1443        with self.assertWarns(DeprecationWarning) as warns:
1444            self.assertTrue(re.match(p, lower_char))
1445        self.assertEqual(
1446            str(warns.warnings[0].message),
1447            'Flags not at the start of the expression %r' % p
1448        )
1449        self.assertEqual(warns.warnings[0].filename, __file__)
1450
1451        p = upper_char + '(?i)%s' % ('.?' * 100)
1452        with self.assertWarns(DeprecationWarning) as warns:
1453            self.assertTrue(re.match(p, lower_char))
1454        self.assertEqual(
1455            str(warns.warnings[0].message),
1456            'Flags not at the start of the expression %r (truncated)' % p[:20]
1457        )
1458        self.assertEqual(warns.warnings[0].filename, __file__)
1459
1460        # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning
1461        with warnings.catch_warnings():
1462            warnings.simplefilter('error', BytesWarning)
1463            p = b'A(?i)'
1464            with self.assertWarns(DeprecationWarning) as warns:
1465                self.assertTrue(re.match(p, b'a'))
1466            self.assertEqual(
1467                str(warns.warnings[0].message),
1468                'Flags not at the start of the expression %r' % p
1469            )
1470            self.assertEqual(warns.warnings[0].filename, __file__)
1471
1472        with self.assertWarns(DeprecationWarning):
1473            self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char))
1474        with self.assertWarns(DeprecationWarning):
1475            self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char))
1476        with self.assertWarns(DeprecationWarning):
1477            self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char))
1478        with self.assertWarns(DeprecationWarning):
1479            self.assertTrue(re.match('^(?i)' + upper_char, lower_char))
1480        with self.assertWarns(DeprecationWarning):
1481            self.assertTrue(re.match('$|(?i)' + upper_char, lower_char))
1482        with self.assertWarns(DeprecationWarning) as warns:
1483            self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char))
1484        self.assertRegex(str(warns.warnings[0].message),
1485                         'Flags not at the start')
1486        self.assertEqual(warns.warnings[0].filename, __file__)
1487        with self.assertWarns(DeprecationWarning) as warns:
1488            self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')',
1489                                         lower_char))
1490        self.assertRegex(str(warns.warnings[0].message),
1491                         'Flags not at the start')
1492        self.assertEqual(warns.warnings[0].filename, __file__)
1493        with self.assertWarns(DeprecationWarning) as warns:
1494            self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')',
1495                                         lower_char))
1496        self.assertRegex(str(warns.warnings[0].message),
1497                         'Flags not at the start')
1498        self.assertEqual(warns.warnings[0].filename, __file__)
1499
1500
1501    def test_dollar_matches_twice(self):
1502        "$ matches the end of string, and just before the terminating \n"
1503        pattern = re.compile('$')
1504        self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1505        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1506        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1507
1508        pattern = re.compile('$', re.MULTILINE)
1509        self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1510        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1511        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1512
1513    def test_bytes_str_mixing(self):
1514        # Mixing str and bytes is disallowed
1515        pat = re.compile('.')
1516        bpat = re.compile(b'.')
1517        self.assertRaises(TypeError, pat.match, b'b')
1518        self.assertRaises(TypeError, bpat.match, 'b')
1519        self.assertRaises(TypeError, pat.sub, b'b', 'c')
1520        self.assertRaises(TypeError, pat.sub, 'b', b'c')
1521        self.assertRaises(TypeError, pat.sub, b'b', b'c')
1522        self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1523        self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1524        self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1525
1526    def test_ascii_and_unicode_flag(self):
1527        # String patterns
1528        for flags in (0, re.UNICODE):
1529            pat = re.compile('\xc0', flags | re.IGNORECASE)
1530            self.assertTrue(pat.match('\xe0'))
1531            pat = re.compile(r'\w', flags)
1532            self.assertTrue(pat.match('\xe0'))
1533        pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
1534        self.assertIsNone(pat.match('\xe0'))
1535        pat = re.compile('(?a)\xc0', re.IGNORECASE)
1536        self.assertIsNone(pat.match('\xe0'))
1537        pat = re.compile(r'\w', re.ASCII)
1538        self.assertIsNone(pat.match('\xe0'))
1539        pat = re.compile(r'(?a)\w')
1540        self.assertIsNone(pat.match('\xe0'))
1541        # Bytes patterns
1542        for flags in (0, re.ASCII):
1543            pat = re.compile(b'\xc0', flags | re.IGNORECASE)
1544            self.assertIsNone(pat.match(b'\xe0'))
1545            pat = re.compile(br'\w', flags)
1546            self.assertIsNone(pat.match(b'\xe0'))
1547        # Incompatibilities
1548        self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
1549        self.assertRaises(re.error, re.compile, br'(?u)\w')
1550        self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
1551        self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
1552        self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
1553        self.assertRaises(re.error, re.compile, r'(?au)\w')
1554
1555    def test_locale_flag(self):
1556        enc = locale.getpreferredencoding()
1557        # Search non-ASCII letter
1558        for i in range(128, 256):
1559            try:
1560                c = bytes([i]).decode(enc)
1561                sletter = c.lower()
1562                if sletter == c: continue
1563                bletter = sletter.encode(enc)
1564                if len(bletter) != 1: continue
1565                if bletter.decode(enc) != sletter: continue
1566                bpat = re.escape(bytes([i]))
1567                break
1568            except (UnicodeError, TypeError):
1569                pass
1570        else:
1571            bletter = None
1572            bpat = b'A'
1573        # Bytes patterns
1574        pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1575        if bletter:
1576            self.assertTrue(pat.match(bletter))
1577        pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1578        if bletter:
1579            self.assertTrue(pat.match(bletter))
1580        pat = re.compile(bpat, re.IGNORECASE)
1581        if bletter:
1582            self.assertIsNone(pat.match(bletter))
1583        pat = re.compile(br'\w', re.LOCALE)
1584        if bletter:
1585            self.assertTrue(pat.match(bletter))
1586        pat = re.compile(br'(?L)\w')
1587        if bletter:
1588            self.assertTrue(pat.match(bletter))
1589        pat = re.compile(br'\w')
1590        if bletter:
1591            self.assertIsNone(pat.match(bletter))
1592        # Incompatibilities
1593        self.assertRaises(ValueError, re.compile, '', re.LOCALE)
1594        self.assertRaises(re.error, re.compile, '(?L)')
1595        self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
1596        self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
1597        self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
1598        self.assertRaises(re.error, re.compile, b'(?aL)')
1599
1600    def test_scoped_flags(self):
1601        self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
1602        self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
1603        self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
1604        self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
1605        self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
1606        self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))
1607
1608        self.assertTrue(re.match(r'(?x: a) b', 'a b'))
1609        self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
1610        self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
1611        self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))
1612
1613        self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0'))
1614        self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0'))
1615        self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII))
1616
1617        self.checkPatternError(r'(?a)(?-a:\w)',
1618                "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8)
1619        self.checkPatternError(r'(?i-i:a)',
1620                'bad inline flags: flag turned on and off', 5)
1621        self.checkPatternError(r'(?au:a)',
1622                "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
1623        self.checkPatternError(br'(?aL:a)',
1624                "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
1625
1626        self.checkPatternError(r'(?-', 'missing flag', 3)
1627        self.checkPatternError(r'(?-+', 'missing flag', 3)
1628        self.checkPatternError(r'(?-z', 'unknown flag', 3)
1629        self.checkPatternError(r'(?-i', 'missing :', 4)
1630        self.checkPatternError(r'(?-i)', 'missing :', 4)
1631        self.checkPatternError(r'(?-i+', 'missing :', 4)
1632        self.checkPatternError(r'(?-iz', 'unknown flag', 4)
1633        self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
1634        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
1635        self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
1636        self.checkPatternError(r'(?iz', 'unknown flag', 3)
1637
1638    def test_bug_6509(self):
1639        # Replacement strings of both types must parse properly.
1640        # all strings
1641        pat = re.compile(r'a(\w)')
1642        self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1643        pat = re.compile('a(.)')
1644        self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1645        pat = re.compile('..')
1646        self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1647
1648        # all bytes
1649        pat = re.compile(br'a(\w)')
1650        self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1651        pat = re.compile(b'a(.)')
1652        self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1653        pat = re.compile(b'..')
1654        self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1655
1656    def test_dealloc(self):
1657        # issue 3299: check for segfault in debug build
1658        import _sre
1659        # the overflow limit is different on wide and narrow builds and it
1660        # depends on the definition of SRE_CODE (see sre.h).
1661        # 2**128 should be big enough to overflow on both. For smaller values
1662        # a RuntimeError is raised instead of OverflowError.
1663        long_overflow = 2**128
1664        self.assertRaises(TypeError, re.finditer, "a", {})
1665        with self.assertRaises(OverflowError):
1666            _sre.compile("abc", 0, [long_overflow], 0, {}, ())
1667        with self.assertRaises(TypeError):
1668            _sre.compile({}, 0, [], 0, [], [])
1669
1670    def test_search_dot_unicode(self):
1671        self.assertTrue(re.search("123.*-", '123abc-'))
1672        self.assertTrue(re.search("123.*-", '123\xe9-'))
1673        self.assertTrue(re.search("123.*-", '123\u20ac-'))
1674        self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1675        self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
1676
1677    def test_compile(self):
1678        # Test return value when given string and pattern as parameter
1679        pattern = re.compile('random pattern')
1680        self.assertIsInstance(pattern, re.Pattern)
1681        same_pattern = re.compile(pattern)
1682        self.assertIsInstance(same_pattern, re.Pattern)
1683        self.assertIs(same_pattern, pattern)
1684        # Test behaviour when not given a string or pattern as parameter
1685        self.assertRaises(TypeError, re.compile, 0)
1686
1687    @bigmemtest(size=_2G, memuse=1)
1688    def test_large_search(self, size):
1689        # Issue #10182: indices were 32-bit-truncated.
1690        s = 'a' * size
1691        m = re.search('$', s)
1692        self.assertIsNotNone(m)
1693        self.assertEqual(m.start(), size)
1694        self.assertEqual(m.end(), size)
1695
1696    # The huge memuse is because of re.sub() using a list and a join()
1697    # to create the replacement result.
1698    @bigmemtest(size=_2G, memuse=16 + 2)
1699    def test_large_subn(self, size):
1700        # Issue #10182: indices were 32-bit-truncated.
1701        s = 'a' * size
1702        r, n = re.subn('', '', s)
1703        self.assertEqual(r, s)
1704        self.assertEqual(n, size + 1)
1705
1706    def test_bug_16688(self):
1707        # Issue 16688: Backreferences make case-insensitive regex fail on
1708        # non-ASCII strings.
1709        self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1710        self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
1711
1712    def test_repeat_minmax_overflow(self):
1713        # Issue #13169
1714        string = "x" * 100000
1715        self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1716        self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1717        self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1718        self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1719        self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1720        self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1721        # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1722        self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1723        self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1724        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1725        self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1726
1727    @cpython_only
1728    def test_repeat_minmax_overflow_maxrepeat(self):
1729        try:
1730            from _sre import MAXREPEAT
1731        except ImportError:
1732            self.skipTest('requires _sre.MAXREPEAT constant')
1733        string = "x" * 100000
1734        self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1735        self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1736                         (0, 100000))
1737        self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1738        self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1739        self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1740        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1741
1742    def test_backref_group_name_in_exception(self):
1743        # Issue 17341: Poor error message when compiling invalid regex
1744        self.checkPatternError('(?P=<foo>)',
1745                               "bad character in group name '<foo>'", 4)
1746
1747    def test_group_name_in_exception(self):
1748        # Issue 17341: Poor error message when compiling invalid regex
1749        self.checkPatternError('(?P<?foo>)',
1750                               "bad character in group name '?foo'", 4)
1751
1752    def test_issue17998(self):
1753        for reps in '*', '+', '?', '{1}':
1754            for mod in '', '?':
1755                pattern = '.' + reps + mod + 'yz'
1756                self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1757                                 ['xyz'], msg=pattern)
1758                pattern = pattern.encode()
1759                self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1760                                 [b'xyz'], msg=pattern)
1761
1762    def test_match_repr(self):
1763        for string in '[abracadabra]', S('[abracadabra]'):
1764            m = re.search(r'(.+)(.*?)\1', string)
1765            pattern = r"<(%s\.)?%s object; span=\(1, 12\), match='abracadabra'>" % (
1766                type(m).__module__, type(m).__qualname__
1767            )
1768            self.assertRegex(repr(m), pattern)
1769        for string in (b'[abracadabra]', B(b'[abracadabra]'),
1770                       bytearray(b'[abracadabra]'),
1771                       memoryview(b'[abracadabra]')):
1772            m = re.search(br'(.+)(.*?)\1', string)
1773            pattern = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % (
1774                type(m).__module__, type(m).__qualname__
1775            )
1776            self.assertRegex(repr(m), pattern)
1777
1778        first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
1779        pattern = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % (
1780            type(second).__module__, type(second).__qualname__
1781        )
1782        self.assertRegex(repr(first), pattern)
1783        pattern = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % (
1784            type(second).__module__, type(second).__qualname__
1785        )
1786        self.assertRegex(repr(second), pattern)
1787
1788    def test_zerowidth(self):
1789        # Issues 852532, 1647489, 3262, 25054.
1790        self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
1791        self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
1792        self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
1793        self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
1794
1795        self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
1796        self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
1797        self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')
1798
1799        self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
1800        self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
1801                         ['', 'a', '', '', 'bc', ''])
1802
1803        self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
1804                         [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
1805        self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
1806                         [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])
1807
1808    def test_bug_2537(self):
1809        # issue 2537: empty submatches
1810        for outer_op in ('{0,}', '*', '+', '{1,187}'):
1811            for inner_op in ('{0,}', '*', '?'):
1812                r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1813                m = r.match("xyyzy")
1814                self.assertEqual(m.group(0), "xyy")
1815                self.assertEqual(m.group(1), "")
1816                self.assertEqual(m.group(2), "y")
1817
    @cpython_only
    def test_debug_flag(self):
        # Compile a pattern with re.DEBUG while stdout is captured and
        # compare the captured text verbatim against the expected dump.
        # The dump has two parts separated by a blank line: an indented
        # tree of parsed nodes and a numbered listing of opcodes.
        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        # Remove the diff length limit so a mismatch shows the whole dump.
        self.maxDiff = None
        dump = '''\
SUBPATTERN 1 0 0
  LITERAL 46
BRANCH
  IN
    LITERAL 99
    LITERAL 104
OR
  LITERAL 112
  LITERAL 121
GROUPREF_EXISTS 1
  AT AT_END
ELSE
  LITERAL 58
  LITERAL 32

 0. INFO 8 0b1 2 5 (to 9)
      prefix_skip 0
      prefix [0x2e] ('.')
      overlap [0]
 9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17.   IN 6 (to 24)
19.     LITERAL 0x63 ('c')
21.     LITERAL 0x68 ('h')
23.     FAILURE
24:   JUMP 9 (to 34)
26: branch 7 (to 33)
27.   LITERAL 0x70 ('p')
29.   LITERAL 0x79 ('y')
31.   JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
'''
        self.assertEqual(out.getvalue(), dump)
        # Debug output is output again even a second time (bypassing
        # the cache -- issue #20426).
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), dump)
1871
1872    def test_keyword_parameters(self):
1873        # Issue #20283: Accepting the string keyword parameter.
1874        pat = re.compile(r'(ab)')
1875        self.assertEqual(
1876            pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1877        self.assertEqual(
1878            pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1879        self.assertEqual(
1880            pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1881        self.assertEqual(
1882            pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1883        self.assertEqual(
1884            pat.split(string='abracadabra', maxsplit=1),
1885            ['', 'ab', 'racadabra'])
1886        self.assertEqual(
1887            pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1888            (7, 9))
1889
1890    def test_bug_20998(self):
1891        # Issue #20998: Fullmatch of repeated single character pattern
1892        # with ignore case.
1893        self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1894
1895    def test_locale_caching(self):
1896        # Issue #22410
1897        oldlocale = locale.setlocale(locale.LC_CTYPE)
1898        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1899        for loc in 'en_US.iso88591', 'en_US.utf8':
1900            try:
1901                locale.setlocale(locale.LC_CTYPE, loc)
1902            except locale.Error:
1903                # Unsupported locale on this system
1904                self.skipTest('test needs %s locale' % loc)
1905
1906        re.purge()
1907        self.check_en_US_iso88591()
1908        self.check_en_US_utf8()
1909        re.purge()
1910        self.check_en_US_utf8()
1911        self.check_en_US_iso88591()
1912
1913    def check_en_US_iso88591(self):
1914        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1915        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1916        self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1917        self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1918        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1919        self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1920        self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1921
1922    def check_en_US_utf8(self):
1923        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1924        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1925        self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1926        self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1927        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1928        self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1929        self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1930
1931    def test_locale_compiled(self):
1932        oldlocale = locale.setlocale(locale.LC_CTYPE)
1933        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1934        for loc in 'en_US.iso88591', 'en_US.utf8':
1935            try:
1936                locale.setlocale(locale.LC_CTYPE, loc)
1937            except locale.Error:
1938                # Unsupported locale on this system
1939                self.skipTest('test needs %s locale' % loc)
1940
1941        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1942        p1 = re.compile(b'\xc5\xe5', re.L|re.I)
1943        p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
1944        p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
1945        p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
1946        for p in p1, p2, p3:
1947            self.assertTrue(p.match(b'\xc5\xe5'))
1948            self.assertTrue(p.match(b'\xe5\xe5'))
1949            self.assertTrue(p.match(b'\xc5\xc5'))
1950        self.assertIsNone(p4.match(b'\xe5\xc5'))
1951        self.assertIsNone(p4.match(b'\xe5\xe5'))
1952        self.assertIsNone(p4.match(b'\xc5\xc5'))
1953
1954        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1955        for p in p1, p2, p3:
1956            self.assertTrue(p.match(b'\xc5\xe5'))
1957            self.assertIsNone(p.match(b'\xe5\xe5'))
1958            self.assertIsNone(p.match(b'\xc5\xc5'))
1959        self.assertTrue(p4.match(b'\xe5\xc5'))
1960        self.assertIsNone(p4.match(b'\xe5\xe5'))
1961        self.assertIsNone(p4.match(b'\xc5\xc5'))
1962
1963    def test_error(self):
1964        with self.assertRaises(re.error) as cm:
1965            re.compile('(\u20ac))')
1966        err = cm.exception
1967        self.assertIsInstance(err.pattern, str)
1968        self.assertEqual(err.pattern, '(\u20ac))')
1969        self.assertEqual(err.pos, 3)
1970        self.assertEqual(err.lineno, 1)
1971        self.assertEqual(err.colno, 4)
1972        self.assertIn(err.msg, str(err))
1973        self.assertIn(' at position 3', str(err))
1974        self.assertNotIn(' at position 3', err.msg)
1975        # Bytes pattern
1976        with self.assertRaises(re.error) as cm:
1977            re.compile(b'(\xa4))')
1978        err = cm.exception
1979        self.assertIsInstance(err.pattern, bytes)
1980        self.assertEqual(err.pattern, b'(\xa4))')
1981        self.assertEqual(err.pos, 3)
1982        # Multiline pattern
1983        with self.assertRaises(re.error) as cm:
1984            re.compile("""
1985                (
1986                    abc
1987                )
1988                )
1989                (
1990                """, re.VERBOSE)
1991        err = cm.exception
1992        self.assertEqual(err.pos, 77)
1993        self.assertEqual(err.lineno, 5)
1994        self.assertEqual(err.colno, 17)
1995        self.assertIn(err.msg, str(err))
1996        self.assertIn(' at position 77', str(err))
1997        self.assertIn('(line 5, column 17)', str(err))
1998
1999    def test_misc_errors(self):
2000        self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
2001        self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
2002        self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
2003        self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
2004        self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
2005        self.checkPatternError(r'(?iz)', 'unknown flag', 3)
2006        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
2007        self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
2008        self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
2009        self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
2010        self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
2011
2012    def test_enum(self):
2013        # Issue #28082: Check that str(flag) returns a human readable string
2014        # instead of an integer
2015        self.assertIn('ASCII', str(re.A))
2016        self.assertIn('DOTALL', str(re.S))
2017
2018    def test_pattern_compare(self):
2019        pattern1 = re.compile('abc', re.IGNORECASE)
2020
2021        # equal to itself
2022        self.assertEqual(pattern1, pattern1)
2023        self.assertFalse(pattern1 != pattern1)
2024
2025        # equal
2026        re.purge()
2027        pattern2 = re.compile('abc', re.IGNORECASE)
2028        self.assertEqual(hash(pattern2), hash(pattern1))
2029        self.assertEqual(pattern2, pattern1)
2030
2031        # not equal: different pattern
2032        re.purge()
2033        pattern3 = re.compile('XYZ', re.IGNORECASE)
2034        # Don't test hash(pattern3) != hash(pattern1) because there is no
2035        # warranty that hash values are different
2036        self.assertNotEqual(pattern3, pattern1)
2037
2038        # not equal: different flag (flags=0)
2039        re.purge()
2040        pattern4 = re.compile('abc')
2041        self.assertNotEqual(pattern4, pattern1)
2042
2043        # only == and != comparison operators are supported
2044        with self.assertRaises(TypeError):
2045            pattern1 < pattern2
2046
2047    def test_pattern_compare_bytes(self):
2048        pattern1 = re.compile(b'abc')
2049
2050        # equal: test bytes patterns
2051        re.purge()
2052        pattern2 = re.compile(b'abc')
2053        self.assertEqual(hash(pattern2), hash(pattern1))
2054        self.assertEqual(pattern2, pattern1)
2055
2056        # not equal: pattern of a different types (str vs bytes),
2057        # comparison must not raise a BytesWarning
2058        re.purge()
2059        pattern3 = re.compile('abc')
2060        with warnings.catch_warnings():
2061            warnings.simplefilter('error', BytesWarning)
2062            self.assertNotEqual(pattern3, pattern1)
2063
2064    def test_bug_29444(self):
2065        s = bytearray(b'abcdefgh')
2066        m = re.search(b'[a-h]+', s)
2067        m2 = re.search(b'[e-h]+', s)
2068        self.assertEqual(m.group(), b'abcdefgh')
2069        self.assertEqual(m2.group(), b'efgh')
2070        s[:] = b'xyz'
2071        self.assertEqual(m.group(), b'xyz')
2072        self.assertEqual(m2.group(), b'')
2073
2074    def test_bug_34294(self):
2075        # Issue 34294: wrong capturing groups
2076
2077        # exists since Python 2
2078        s = "a\tx"
2079        p = r"\b(?=(\t)|(x))x"
2080        self.assertEqual(re.search(p, s).groups(), (None, 'x'))
2081
2082        # introduced in Python 3.7.0
2083        s = "ab"
2084        p = r"(?=(.)(.)?)"
2085        self.assertEqual(re.findall(p, s),
2086                         [('a', 'b'), ('b', '')])
2087        self.assertEqual([m.groups() for m in re.finditer(p, s)],
2088                         [('a', 'b'), ('b', None)])
2089
2090        # test-cases provided by issue34294, introduced in Python 3.7.0
2091        p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
2092        s = "<test><foo2/></test>"
2093        self.assertEqual(re.findall(p, s),
2094                         [('test', '<foo2/>'), ('foo2', '')])
2095        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2096                         [{'tag': 'test', 'text': '<foo2/>'},
2097                          {'tag': 'foo2', 'text': None}])
2098        s = "<test>Hello</test><foo/>"
2099        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2100                         [{'tag': 'test', 'text': 'Hello'},
2101                          {'tag': 'foo', 'text': None}])
2102        s = "<test>Hello</test><foo/><foo/>"
2103        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2104                         [{'tag': 'test', 'text': 'Hello'},
2105                          {'tag': 'foo', 'text': None},
2106                          {'tag': 'foo', 'text': None}])
2107
2108
class PatternReprTests(unittest.TestCase):
    """Checks of the text returned by repr() for patterns and flags."""

    def check(self, pattern, expected):
        # Compile without flags and compare the repr text.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile with explicit flags and compare the repr text.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        combined = re.I | re.S | re.X
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', combined, expected)

    def test_unicode_flag(self):
        # re.UNICODE is expected to be left out of the repr.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I | re.S | re.U, expected)

    def test_inline_flags(self):
        # A flag set inline in the pattern is expected to show up too.
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a flag name are expected to be rendered in hex.
        unknown_bits = 0x123000
        self.check_flags('random pattern', unknown_bits,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags(
            'random pattern', unknown_bits | re.I,
            "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern', "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        # Each entry pairs a pattern with the repr text expected for it.
        quoting_cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in quoting_cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern must stay under 300 characters
        # while keeping the start of the pattern and any trailing flags.
        source = 'Very %spattern' % ('long ' * 1000)

        text = repr(re.compile(source))
        self.assertLess(len(text), 300)
        self.assertEqual(text[:30], "re.compile('Very long long lon")

        text = repr(re.compile(source, re.I))
        self.assertLess(len(text), 300)
        self.assertEqual(text[:30], "re.compile('Very long long lon")
        self.assertEqual(text[-16:], ", re.IGNORECASE)")

    def test_flags_repr(self):
        # repr() of flag values: single, combined, with an unnamed bit,
        # and the inverted form of each.
        self.assertEqual(repr(re.I), "re.IGNORECASE")
        named = re.I | re.S | re.X
        self.assertEqual(repr(named),
                         "re.IGNORECASE|re.DOTALL|re.VERBOSE")
        with_extra_bit = re.I | re.S | re.X | (1 << 20)
        self.assertEqual(repr(with_extra_bit),
                         "re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000")
        self.assertEqual(repr(~re.I), "~re.IGNORECASE")
        self.assertEqual(repr(~(re.I | re.S | re.X)),
                         "~(re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.assertEqual(repr(~(re.I | re.S | re.X | (1 << 20))),
                         "~(re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000)")
2185
2186
2187class ImplementationTest(unittest.TestCase):
2188    """
2189    Test implementation details of the re module.
2190    """
2191
2192    @cpython_only
2193    def test_immutable(self):
2194        # bpo-43908: check that re types are immutable
2195        with self.assertRaises(TypeError):
2196            re.Match.foo = 1
2197        with self.assertRaises(TypeError):
2198            re.Pattern.foo = 1
2199        with self.assertRaises(TypeError):
2200            pat = re.compile("")
2201            tp = type(pat.scanner(""))
2202            tp.foo = 1
2203
2204    def test_overlap_table(self):
2205        f = sre_compile._generate_overlap_table
2206        self.assertEqual(f(""), [])
2207        self.assertEqual(f("a"), [0])
2208        self.assertEqual(f("abcd"), [0, 0, 0, 0])
2209        self.assertEqual(f("aaaa"), [0, 1, 2, 3])
2210        self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1])
2211        self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0])
2212
2213    def test_signedness(self):
2214        self.assertGreaterEqual(sre_compile.MAXREPEAT, 0)
2215        self.assertGreaterEqual(sre_compile.MAXGROUPS, 0)
2216
2217    @cpython_only
2218    def test_disallow_instantiation(self):
2219        # Ensure that the type disallows instantiation (bpo-43916)
2220        check_disallow_instantiation(self, re.Match)
2221        check_disallow_instantiation(self, re.Pattern)
2222        pat = re.compile("")
2223        check_disallow_instantiation(self, type(pat.scanner("")))
2224
2225
class ExternalTests(unittest.TestCase):
    # Data-driven tests: the cases come from the tables in test.re_tests.

    def test_re_benchmarks(self):
        're_tests benchmarks'
        # Every benchmark entry is a (pattern, string) pair that must match
        # via search(), match() and fullmatch(), both directly and when the
        # string is embedded between two runs of 10000 spaces and addressed
        # through the pos/endpos arguments.
        from test.re_tests import benchmarks
        for pattern, s in benchmarks:
            with self.subTest(pattern=pattern, string=s):
                p = re.compile(pattern)
                self.assertTrue(p.search(s))
                self.assertTrue(p.match(s))
                self.assertTrue(p.fullmatch(s))
                s2 = ' '*10000 + s + ' '*10000
                self.assertTrue(p.search(s2))
                self.assertTrue(p.match(s2, 10000))
                self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
                self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))

    def test_re_tests(self):
        're_tests test suite'
        from test.re_tests import tests, FAIL, SYNTAX_ERROR
        for t in tests:
            # A test tuple is either (pattern, s, outcome) or
            # (pattern, s, outcome, repl, expected); fields missing from
            # the short form stay None.
            pattern = s = outcome = repl = expected = None
            if len(t) == 5:
                pattern, s, outcome, repl, expected = t
            elif len(t) == 3:
                pattern, s, outcome = t
            else:
                raise ValueError('Test tuples should have 3 or 5 fields', t)

            with self.subTest(pattern=pattern, string=s):
                if outcome == SYNTAX_ERROR:  # Expected a syntax error
                    with self.assertRaises(re.error):
                        re.compile(pattern)
                    continue

                obj = re.compile(pattern)
                result = obj.search(s)
                if outcome == FAIL:
                    self.assertIsNone(result, 'Succeeded incorrectly')
                    continue

                # NOTE(review): from here on repl is evaluated, so 3-field
                # tuples are presumably only used for FAIL / SYNTAX_ERROR
                # outcomes -- verify against test.re_tests.
                with self.subTest():
                    self.assertTrue(result, 'Failed incorrectly')
                    # Matched, as expected, so now we compute the
                    # result string and compare it to our expected result.
                    start, end = result.span(0)
                    # Names made available to the repl expression: the
                    # whole match, the pattern flags, g1..g99 and every
                    # named group.
                    vardict = {'found': result.group(0),
                               'groups': result.group(),
                               'flags': result.re.flags}
                    for i in range(1, 100):
                        try:
                            gi = result.group(i)
                            # Special hack because else the string concat fails:
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            # The pattern has no group with this number.
                            gi = "Error"
                        vardict['g%d' % i] = gi
                    for i in result.re.groupindex.keys():
                        try:
                            gi = result.group(i)
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            gi = "Error"
                        vardict[i] = gi
                    # repl is an expression string taken from the test
                    # table; it is evaluated with vardict as its globals.
                    self.assertEqual(eval(repl, vardict), expected,
                                     'grouping error')

                # Try the match with both pattern and string converted to
                # bytes, and check that it still succeeds.
                try:
                    bpat = bytes(pattern, "ascii")
                    bs = bytes(s, "ascii")
                except UnicodeEncodeError:
                    # skip non-ascii tests
                    pass
                else:
                    with self.subTest('bytes pattern match'):
                        obj = re.compile(bpat)
                        self.assertTrue(obj.search(bs))

                    # Try the match with LOCALE enabled, and check that it
                    # still succeeds.
                    # A failure here is only printed, not asserted.
                    with self.subTest('locale-sensitive match'):
                        obj = re.compile(bpat, re.LOCALE)
                        result = obj.search(bs)
                        if result is None:
                            print('=== Fails on locale-sensitive match', t)

                # Try the match with the search area limited to the extent
                # of the match and see if it still succeeds.  \B will
                # break (because it won't match at the end or start of a
                # string), so we'll ignore patterns that feature it.
                # NOTE(review): for ASCII cases `result` was rebound by the
                # LOCALE search above, while start/end still come from the
                # str match -- confirm this guard is intended to depend on
                # the LOCALE result.
                if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
                            and result is not None):
                    with self.subTest('range-limited match'):
                        obj = re.compile(pattern)
                        self.assertTrue(obj.search(s, start, end + 1))

                # Try the match with IGNORECASE enabled, and check that it
                # still succeeds.
                with self.subTest('case-insensitive match'):
                    obj = re.compile(pattern, re.IGNORECASE)
                    self.assertTrue(obj.search(s))

                # Try the match with the UNICODE flag enabled, and check
                # that it still succeeds.
                with self.subTest('unicode-sensitive match'):
                    obj = re.compile(pattern, re.UNICODE)
                    self.assertTrue(obj.search(s))
2337
2338
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
2341