• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# -*- coding: utf-8 -*-
2from test.test_support import (
3    verbose, run_unittest, import_module,
4    precisionbigmemtest, _2G, cpython_only,
5    captured_stdout, have_unicode, requires_unicode, u,
6    check_warnings, check_py3k_warnings)
7import locale
8import re
9from re import Scanner
10import sre_constants
11import sys
12import string
13import traceback
14from weakref import proxy
15
16
17# Misc tests from Tim Peters' re.doc
18
19# WARNING: Don't change details in these tests if you don't know
20# what you're doing. Some of these tests were carefully modeled to
21# cover most of the code.
22
23import unittest
24
25class ReTests(unittest.TestCase):
26
27    def test_weakref(self):
28        s = 'QabbbcR'
29        x = re.compile('ab+c')
30        y = proxy(x)
31        self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
32
33    def test_search_star_plus(self):
34        self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
35        self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
36        self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
37        self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
38        self.assertIsNone(re.search('x', 'aaa'))
39        self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
40        self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
41        self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
42        self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
43        self.assertIsNone(re.match('a+', 'xxx'))
44
45    def bump_num(self, matchobj):
46        int_value = int(matchobj.group(0))
47        return str(int_value + 1)
48
49    def test_basic_re_sub(self):
50        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
51        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
52                         '9.3 -3 24x100y')
53        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
54                         '9.3 -3 23x99y')
55
56        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
57        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
58
59        s = r"\1\1"
60        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
61        self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
62        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
63
64        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
65        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
66        self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
67        self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
68
69        self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
70        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
71        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
72                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
73        for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
74            with check_py3k_warnings():
75                self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
76
77        self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
78
79    def test_bug_449964(self):
80        # fails for group followed by other escape
81        self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
82                         'xx\bxx\b')
83
84    def test_bug_449000(self):
85        # Test for sub() on escaped characters
86        self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
87                         'abc\ndef\n')
88        self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
89                         'abc\ndef\n')
90        self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
91                         'abc\ndef\n')
92        self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
93                         'abc\ndef\n')
94
95    @requires_unicode
96    def test_bug_1140(self):
97        # re.sub(x, y, u'') should return u'', not '', and
98        # re.sub(x, y, '') should return '', not u''.
99        # Also:
100        # re.sub(x, y, unicode(x)) should return unicode(y), and
101        # re.sub(x, y, str(x)) should return
102        #     str(y) if isinstance(y, str) else unicode(y).
103        for x in 'x', u'x':
104            for y in 'y', u'y':
105                z = re.sub(x, y, u'')
106                self.assertEqual(z, u'')
107                self.assertEqual(type(z), unicode)
108                #
109                z = re.sub(x, y, '')
110                self.assertEqual(z, '')
111                self.assertEqual(type(z), str)
112                #
113                z = re.sub(x, y, unicode(x))
114                self.assertEqual(z, y)
115                self.assertEqual(type(z), unicode)
116                #
117                z = re.sub(x, y, str(x))
118                self.assertEqual(z, y)
119                self.assertEqual(type(z), type(y))
120
121    def test_bug_1661(self):
122        # Verify that flags do not get silently ignored with compiled patterns
123        pattern = re.compile('.')
124        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
125        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
126        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
127        self.assertRaises(ValueError, re.compile, pattern, re.I)
128
129    def test_bug_3629(self):
130        # A regex that triggered a bug in the sre-code validator
131        re.compile("(?P<quote>)(?(quote))")
132
133    def test_sub_template_numeric_escape(self):
134        # bug 776311 and friends
135        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
136        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
137        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
138        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
139        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
140        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
141        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
142
143        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
144        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
145
146        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
147        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
148        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
149        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
150        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
151
152        self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
153        self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
154
155        self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
156        self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
157        self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
158        self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
159        self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
160        self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
161        self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
162        self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
163        self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
164        self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
165        self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
166        self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
167
168        # in python2.3 (etc), these loop endlessly in sre_parser.py
169        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
170        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
171                         'xz8')
172        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
173                         'xza')
174
175    def test_qualified_re_sub(self):
176        self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
177        self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
178
179    def test_bug_114660(self):
180        self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
181                         'hello there')
182
183    def test_bug_462270(self):
184        # Test for empty sub() behaviour, see SF bug #462270
185        self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
186        self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
187
188    def test_symbolic_groups(self):
189        re.compile('(?P<a>x)(?P=a)(?(a)y)')
190        re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
191        self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
192        self.assertRaises(re.error, re.compile, '(?Px)')
193        self.assertRaises(re.error, re.compile, '(?P=)')
194        self.assertRaises(re.error, re.compile, '(?P=1)')
195        self.assertRaises(re.error, re.compile, '(?P=a)')
196        self.assertRaises(re.error, re.compile, '(?P=a1)')
197        self.assertRaises(re.error, re.compile, '(?P=a.)')
198        self.assertRaises(re.error, re.compile, '(?P<)')
199        self.assertRaises(re.error, re.compile, '(?P<>)')
200        self.assertRaises(re.error, re.compile, '(?P<1>)')
201        self.assertRaises(re.error, re.compile, '(?P<a.>)')
202        self.assertRaises(re.error, re.compile, '(?())')
203        self.assertRaises(re.error, re.compile, '(?(a))')
204        self.assertRaises(re.error, re.compile, '(?(1a))')
205        self.assertRaises(re.error, re.compile, '(?(a.))')
206
207    def test_symbolic_refs(self):
208        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
209        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
210        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
211        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
212        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
213        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
214        self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
215        self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
216        self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
217        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
218
219    def test_re_subn(self):
220        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
221        self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
222        self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
223        self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
224        self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
225
226    def test_re_split(self):
227        self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
228        self.assertEqual(re.split(":+", ":a:b::c"), ['', 'a', 'b', 'c'])
229        self.assertEqual(re.split("(:+)", ":a:b::c"),
230                         ['', ':', 'a', ':', 'b', '::', 'c'])
231        self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
232        self.assertEqual(re.split("(:)+", ":a:b::c"),
233                         ['', ':', 'a', ':', 'b', ':', 'c'])
234        self.assertEqual(re.split("([b:]+)", ":a:b::c"),
235                         ['', ':', 'a', ':b::', 'c'])
236        self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
237                         ['', None, ':', 'a', None, ':', '', 'b', None, '',
238                          None, '::', 'c'])
239        self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
240                         ['', 'a', '', '', 'c'])
241
242        for sep, expected in [
243            (':*', ['', 'a', 'b', 'c']),
244            ('(?::*)', ['', 'a', 'b', 'c']),
245            ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
246            ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
247        ]:
248            with check_py3k_warnings(('', FutureWarning)):
249                self.assertEqual(re.split(sep, ':a:b::c'), expected)
250
251        for sep, expected in [
252            ('', [':a:b::c']),
253            (r'\b', [':a:b::c']),
254            (r'(?=:)', [':a:b::c']),
255            (r'(?<=:)', [':a:b::c']),
256        ]:
257            with check_py3k_warnings():
258                self.assertEqual(re.split(sep, ':a:b::c'), expected)
259
260    def test_qualified_re_split(self):
261        self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
262        self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
263        self.assertEqual(re.split("(:)", ":a:b::c", 2),
264                         ['', ':', 'a', ':', 'b::c'])
265        self.assertEqual(re.split("(:+)", ":a:b::c", 2),
266                         ['', ':', 'a', ':', 'b::c'])
267        with check_py3k_warnings(('', FutureWarning)):
268            self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
269                             ['', ':', 'a', ':', 'b::c'])
270
271    def test_re_findall(self):
272        self.assertEqual(re.findall(":+", "abc"), [])
273        self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
274        self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
275        self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
276                                                               (":", ":"),
277                                                               (":", "::")])
278
279    def test_bug_117612(self):
280        self.assertEqual(re.findall(r"(a|(b))", "aba"),
281                         [("a", ""),("b", "b"),("a", "")])
282
283    def test_re_match(self):
284        self.assertEqual(re.match('a', 'a').groups(), ())
285        self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
286        self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
287        self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
288        self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
289
290        pat = re.compile('((a)|(b))(c)?')
291        self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
292        self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
293        self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
294        self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
295        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
296
297        # A single group
298        m = re.match('(a)', 'a')
299        self.assertEqual(m.group(0), 'a')
300        self.assertEqual(m.group(0), 'a')
301        self.assertEqual(m.group(1), 'a')
302        self.assertEqual(m.group(1, 1), ('a', 'a'))
303
304        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
305        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
306        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
307                         (None, 'b', None))
308        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
309
310    def test_re_groupref_exists(self):
311        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
312                         ('(', 'a'))
313        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
314                         (None, 'a'))
315        self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
316        self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
317        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
318                         ('a', 'b'))
319        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
320                         (None, 'd'))
321        self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
322                         (None, 'd'))
323        self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
324                         ('a', ''))
325
326        # Tests for bug #1177831: exercise groups other than the first group
327        p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
328        self.assertEqual(p.match('abc').groups(),
329                         ('a', 'b', 'c'))
330        self.assertEqual(p.match('ad').groups(),
331                         ('a', None, 'd'))
332        self.assertIsNone(p.match('abd'))
333        self.assertIsNone(p.match('ac'))
334
335
336    def test_re_groupref(self):
337        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
338                         ('|', 'a'))
339        self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
340                         (None, 'a'))
341        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
342        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
343        self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
344                         ('a', 'a'))
345        self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
346                         (None, None))
347
348    def test_groupdict(self):
349        self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
350                                  'first second').groupdict(),
351                         {'first':'first', 'second':'second'})
352
353    def test_expand(self):
354        self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
355                                  "first second")
356                                  .expand(r"\2 \1 \g<second> \g<first>"),
357                         "second first second first")
358
359    def test_repeat_minmax(self):
360        self.assertIsNone(re.match("^(\w){1}$", "abc"))
361        self.assertIsNone(re.match("^(\w){1}?$", "abc"))
362        self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
363        self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
364
365        self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
366        self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
367        self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
368        self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
369        self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
370        self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
371        self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
372        self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
373
374        self.assertIsNone(re.match("^x{1}$", "xxx"))
375        self.assertIsNone(re.match("^x{1}?$", "xxx"))
376        self.assertIsNone(re.match("^x{1,2}$", "xxx"))
377        self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
378
379        self.assertTrue(re.match("^x{3}$", "xxx"))
380        self.assertTrue(re.match("^x{1,3}$", "xxx"))
381        self.assertTrue(re.match("^x{1,4}$", "xxx"))
382        self.assertTrue(re.match("^x{3,4}?$", "xxx"))
383        self.assertTrue(re.match("^x{3}?$", "xxx"))
384        self.assertTrue(re.match("^x{1,3}?$", "xxx"))
385        self.assertTrue(re.match("^x{1,4}?$", "xxx"))
386        self.assertTrue(re.match("^x{3,4}?$", "xxx"))
387
388        self.assertIsNone(re.match("^x{}$", "xxx"))
389        self.assertTrue(re.match("^x{}$", "x{}"))
390
391    def test_getattr(self):
392        self.assertEqual(re.match("(a)", "a").pos, 0)
393        self.assertEqual(re.match("(a)", "a").endpos, 1)
394        self.assertEqual(re.match("(a)", "a").string, "a")
395        self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
396        self.assertTrue(re.match("(a)", "a").re)
397
398    def test_special_escapes(self):
399        self.assertEqual(re.search(r"\b(b.)\b",
400                                   "abcd abc bcd bx").group(1), "bx")
401        self.assertEqual(re.search(r"\B(b.)\B",
402                                   "abc bcd bc abxd").group(1), "bx")
403        self.assertEqual(re.search(r"\b(b.)\b",
404                                   "abcd abc bcd bx", re.LOCALE).group(1), "bx")
405        self.assertEqual(re.search(r"\B(b.)\B",
406                                   "abc bcd bc abxd", re.LOCALE).group(1), "bx")
407        if have_unicode:
408            self.assertEqual(re.search(r"\b(b.)\b",
409                                       "abcd abc bcd bx", re.UNICODE).group(1), "bx")
410            self.assertEqual(re.search(r"\B(b.)\B",
411                                       "abc bcd bc abxd", re.UNICODE).group(1), "bx")
412        self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
413        self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
414        self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
415        self.assertEqual(re.search(r"\b(b.)\b",
416                                   u"abcd abc bcd bx").group(1), "bx")
417        self.assertEqual(re.search(r"\B(b.)\B",
418                                   u"abc bcd bc abxd").group(1), "bx")
419        self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
420        self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
421        self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M))
422        self.assertEqual(re.search(r"\d\D\w\W\s\S",
423                                   "1aa! a").group(0), "1aa! a")
424        self.assertEqual(re.search(r"\d\D\w\W\s\S",
425                                   "1aa! a", re.LOCALE).group(0), "1aa! a")
426        if have_unicode:
427            self.assertEqual(re.search(r"\d\D\w\W\s\S",
428                                       "1aa! a", re.UNICODE).group(0), "1aa! a")
429
430    def test_other_escapes(self):
431        self.assertRaises(re.error, re.compile, "\\")
432        self.assertEqual(re.match(r"\(", '(').group(), '(')
433        self.assertIsNone(re.match(r"\(", ')'))
434        self.assertEqual(re.match(r"\\", '\\').group(), '\\')
435        self.assertEqual(re.match(r"[\]]", ']').group(), ']')
436        self.assertIsNone(re.match(r"[\]]", '['))
437        self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
438        self.assertIsNone(re.match(r"[a\-c]", 'b'))
439        self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
440        self.assertIsNone(re.match(r"[\^a]+", 'b'))
441        re.purge()  # for warnings
442        for c in 'ceghijklmopquyzCEFGHIJKLMNOPQRTUVXY':
443            warn = FutureWarning if c in 'Uu' else DeprecationWarning
444            with check_py3k_warnings(('', warn)):
445                self.assertEqual(re.match('\\%c$' % c, c).group(), c)
446                self.assertIsNone(re.match('\\%c' % c, 'a'))
447        for c in 'ceghijklmopquyzABCEFGHIJKLMNOPQRTUVXYZ':
448            warn = FutureWarning if c in 'Uu' else DeprecationWarning
449            with check_py3k_warnings(('', warn)):
450                self.assertEqual(re.match('[\\%c]$' % c, c).group(), c)
451                self.assertIsNone(re.match('[\\%c]' % c, 'a'))
452
453    def test_string_boundaries(self):
454        # See http://bugs.python.org/issue10713
455        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
456                         "abc")
457        # There's a word boundary at the start of a string.
458        self.assertTrue(re.match(r"\b", "abc"))
459        # A non-empty string includes a non-boundary zero-length match.
460        self.assertTrue(re.search(r"\B", "abc"))
461        # There is no non-boundary match at the start of a string.
462        self.assertFalse(re.match(r"\B", "abc"))
463        # However, an empty string contains no word boundaries, and also no
464        # non-boundaries.
465        self.assertIsNone(re.search(r"\B", ""))
466        # This one is questionable and different from the perlre behaviour,
467        # but describes current behavior.
468        self.assertIsNone(re.search(r"\b", ""))
469        # A single word-character string has two boundaries, but no
470        # non-boundary gaps.
471        self.assertEqual(len(re.findall(r"\b", "a")), 2)
472        self.assertEqual(len(re.findall(r"\B", "a")), 0)
473        # If there are no words, there are no boundaries
474        self.assertEqual(len(re.findall(r"\b", " ")), 0)
475        self.assertEqual(len(re.findall(r"\b", "   ")), 0)
476        # Can match around the whitespace.
477        self.assertEqual(len(re.findall(r"\B", " ")), 2)
478
479    @requires_unicode
480    def test_bigcharset(self):
481        self.assertEqual(re.match(u(r"([\u2222\u2223])"),
482                                  unichr(0x2222)).group(1), unichr(0x2222))
483        self.assertEqual(re.match(u(r"([\u2222\u2223])"),
484                                  unichr(0x2222), re.UNICODE).group(1), unichr(0x2222))
485        r = u'[%s]' % u''.join(map(unichr, range(256, 2**16, 255)))
486        self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(), unichr(0xff01))
487
488    def test_big_codesize(self):
489        # Issue #1160
490        r = re.compile('|'.join(('%d'%x for x in range(10000))))
491        self.assertTrue(r.match('1000'))
492        self.assertTrue(r.match('9999'))
493
494    def test_anyall(self):
495        self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
496                         "a\nb")
497        self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
498                         "a\n\nb")
499
500    def test_lookahead(self):
501        self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
502        self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
503        self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
504        self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
505        self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
506        self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
507        self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
508
509        self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
510        self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
511        self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
512        self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
513
514        # Group reference.
515        self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
516        self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
517        # Named group reference.
518        self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba'))
519        self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac'))
520        # Conditional group reference.
521        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
522        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
523        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
524        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
525        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
526        # Group used before defined.
527        self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
528        self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
529        self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
530
531    def test_lookbehind(self):
532        self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
533        self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
534        self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
535        self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
536        # Group reference.
537        with check_warnings(('', RuntimeWarning)):
538            re.compile(r'(a)a(?<=\1)c')
539        # Named group reference.
540        with check_warnings(('', RuntimeWarning)):
541            re.compile(r'(?P<g>a)a(?<=(?P=g))c')
542        # Conditional group reference.
543        with check_warnings(('', RuntimeWarning)):
544            re.compile(r'(a)b(?<=(?(1)b|x))c')
545        # Group used before defined.
546        with check_warnings(('', RuntimeWarning)):
547            re.compile(r'(a)b(?<=(?(2)b|x))(c)')
548
549    def test_ignore_case(self):
550        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
551        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
552        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
553        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
554        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
555        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
556        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
557        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
558        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
559        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
560
561        if have_unicode:
562            assert u(r'\u212a').lower() == u'k' # 'K'
563            self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
564            self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
565            self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
566            self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
567            assert u(r'\u017f').upper() == u'S' # 'ſ'
568            self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
569            self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
570            self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
571            self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
572
573    def test_ignore_case_set(self):
574        self.assertTrue(re.match(r'[19A]', 'A', re.I))
575        self.assertTrue(re.match(r'[19a]', 'a', re.I))
576        self.assertTrue(re.match(r'[19a]', 'A', re.I))
577        self.assertTrue(re.match(r'[19A]', 'a', re.I))
578        if have_unicode:
579            self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
580            self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
581            self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
582            self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
583            assert u(r'\u212a').lower() == u'k' # 'K'
584            self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
585            self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
586            self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
587            self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
588            assert u(r'\u017f').upper() == u'S' # 'ſ'
589            self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
590            self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
591            self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
592            self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
593
594    def test_ignore_case_range(self):
595        # Issues #3511, #17381.
596        self.assertTrue(re.match(r'[9-a]', '_', re.I))
597        self.assertIsNone(re.match(r'[9-A]', '_', re.I))
598        self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
599        self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
600        self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I))
601        self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
602        if have_unicode:
603            self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
604            self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
605            self.assertTrue(re.match(u(r'[\xc0-\xde]'),
606                                     u(r'\xd7'), re.U | re.I))
607            self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
608                                       u(r'\xf7'), re.U | re.I))
609            self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
610                                     u(r'\xf7'), re.U | re.I))
611            self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
612                                       u(r'\xd7'), re.U | re.I))
613            self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
614                                     u(r'\u0450'), re.U | re.I))
615            self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
616                                     u(r'\u0400'), re.U | re.I))
617            self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
618                                     u(r'\u0450'), re.U | re.I))
619            self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
620                                     u(r'\u0400'), re.U | re.I))
621            if sys.maxunicode > 0xffff:
622                self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
623                                         u(r'\U00010428'), re.U | re.I))
624                self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
625                                         u(r'\U00010400'), re.U | re.I))
626                self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
627                                         u(r'\U00010428'), re.U | re.I))
628                self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
629                                         u(r'\U00010400'), re.U | re.I))
630
631            assert u(r'\u212a').lower() == u'k' # 'K'
632            self.assertTrue(re.match(ur'[J-M]', u(r'\u212a'), re.U | re.I))
633            self.assertTrue(re.match(ur'[j-m]', u(r'\u212a'), re.U | re.I))
634            self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'K', re.U | re.I))
635            self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'k', re.U | re.I))
636            assert u(r'\u017f').upper() == u'S' # 'ſ'
637            self.assertTrue(re.match(ur'[R-T]', u(r'\u017f'), re.U | re.I))
638            self.assertTrue(re.match(ur'[r-t]', u(r'\u017f'), re.U | re.I))
639            self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u'S', re.U | re.I))
640            self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u's', re.U | re.I))
641
642    def test_category(self):
643        self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
644
645    def test_getlower(self):
646        import _sre
647        self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
648        self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
649        if have_unicode:
650            self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
651
652        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
653        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
654
655    def test_not_literal(self):
656        self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
657        self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
658
659    def test_search_coverage(self):
660        self.assertEqual(re.search("\s(b)", " b").group(1), "b")
661        self.assertEqual(re.search("a\s", "a ").group(0), "a ")
662
663    def assertMatch(self, pattern, text, match=None, span=None,
664                    matcher=re.match):
665        if match is None and span is None:
666            # the pattern matches the whole text
667            match = text
668            span = (0, len(text))
669        elif match is None or span is None:
670            raise ValueError('If match is not None, span should be specified '
671                             '(and vice versa).')
672        m = matcher(pattern, text)
673        self.assertTrue(m)
674        self.assertEqual(m.group(), match)
675        self.assertEqual(m.span(), span)
676
677    @requires_unicode
678    def test_re_escape(self):
679        alnum_chars = unicode(string.ascii_letters + string.digits)
680        p = u''.join(unichr(i) for i in range(256))
681        for c in p:
682            if c in alnum_chars:
683                self.assertEqual(re.escape(c), c)
684            elif c == u'\x00':
685                self.assertEqual(re.escape(c), u'\\000')
686            else:
687                self.assertEqual(re.escape(c), u'\\' + c)
688            self.assertMatch(re.escape(c), c)
689        self.assertMatch(re.escape(p), p)
690
691    def test_re_escape_byte(self):
692        alnum_chars = string.ascii_letters + string.digits
693        p = ''.join(chr(i) for i in range(256))
694        for b in p:
695            if b in alnum_chars:
696                self.assertEqual(re.escape(b), b)
697            elif b == b'\x00':
698                self.assertEqual(re.escape(b), b'\\000')
699            else:
700                self.assertEqual(re.escape(b), b'\\' + b)
701            self.assertMatch(re.escape(b), b)
702        self.assertMatch(re.escape(p), p)
703
704    @requires_unicode
705    def test_re_escape_non_ascii(self):
706        s = u(r'xxx\u2620\u2620\u2620xxx')
707        s_escaped = re.escape(s)
708        self.assertEqual(s_escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx'))
709        self.assertMatch(s_escaped, s)
710        self.assertMatch(u'.%s+.' % re.escape(unichr(0x2620)), s,
711                         u(r'x\u2620\u2620\u2620x'), (2, 7), re.search)
712
713    def test_re_escape_non_ascii_bytes(self):
714        b = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
715        b_escaped = re.escape(b)
716        self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
717        self.assertMatch(b_escaped, b)
718        res = re.findall(re.escape(b'\xe2\x98\xa0'), b)
719        self.assertEqual(len(res), 2)
720
721    def test_pickling(self):
722        import pickle
723        self.pickle_test(pickle)
724        import cPickle
725        self.pickle_test(cPickle)
726        # old pickles expect the _compile() reconstructor in sre module
727        import_module("sre", deprecated=True)
728        from sre import _compile
729        # current pickle expects the _compile() reconstructor in re module
730        from re import _compile
731
732    def pickle_test(self, pickle):
733        oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
734        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
735            pickled = pickle.dumps(oldpat, proto)
736            newpat = pickle.loads(pickled)
737            self.assertEqual(newpat, oldpat)
738
739    def test_constants(self):
740        self.assertEqual(re.I, re.IGNORECASE)
741        self.assertEqual(re.L, re.LOCALE)
742        self.assertEqual(re.M, re.MULTILINE)
743        self.assertEqual(re.S, re.DOTALL)
744        self.assertEqual(re.X, re.VERBOSE)
745
746    def test_flags(self):
747        for flag in [re.I, re.M, re.X, re.S, re.L]:
748            self.assertTrue(re.compile('^pattern$', flag))
749
750    def test_sre_character_literals(self):
751        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
752            self.assertTrue(re.match(r"\%03o" % i, chr(i)))
753            self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
754            self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
755            self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
756            self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
757            self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
758        self.assertRaises(re.error, re.match, "\911", "")
759
760    def test_sre_character_class_literals(self):
761        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
762            self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
763            self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
764            self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
765            self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
766            self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
767            self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
768        self.assertRaises(re.error, re.match, "[\911]", "")
769
770    def test_bug_113254(self):
771        self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
772        self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
773        self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
774
775    def test_bug_527371(self):
776        # bug described in patches 527371/672491
777        self.assertIsNone(re.match(r'(a)?a','a').lastindex)
778        self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
779        self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
780        self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
781        self.assertEqual(re.match("((a))", "a").lastindex, 1)
782
783    def test_bug_545855(self):
784        # bug 545855 -- This pattern failed to cause a compile error as it
785        # should, instead provoking a TypeError.
786        self.assertRaises(re.error, re.compile, 'foo[a-')
787
788    def test_bug_418626(self):
789        # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
790        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
791        # pattern '*?' on a long string.
792        self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
793        self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
794                         20003)
795        self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
796        # non-simple '*?' still used to hit the recursion limit, before the
797        # non-recursive scheme was implemented.
798        self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
799
800    @requires_unicode
801    def test_bug_612074(self):
802        pat=u"["+re.escape(unichr(0x2039))+u"]"
803        self.assertEqual(re.compile(pat) and 1, 1)
804
805    def test_stack_overflow(self):
806        # nasty cases that used to overflow the straightforward recursive
807        # implementation of repeated groups.
808        self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
809        self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
810        self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
811
812    def test_unlimited_zero_width_repeat(self):
813        # Issue #9669
814        self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
815        self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
816        self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
817        self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
818        self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
819        self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
820
821    def test_scanner(self):
822        def s_ident(scanner, token): return token
823        def s_operator(scanner, token): return "op%s" % token
824        def s_float(scanner, token): return float(token)
825        def s_int(scanner, token): return int(token)
826
827        scanner = Scanner([
828            (r"[a-zA-Z_]\w*", s_ident),
829            (r"\d+\.\d*", s_float),
830            (r"\d+", s_int),
831            (r"=|\+|-|\*|/", s_operator),
832            (r"\s+", None),
833            ])
834
835        self.assertTrue(scanner.scanner.scanner("").pattern)
836
837        self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
838                         (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
839                           'op+', 'bar'], ''))
840
841    def test_bug_448951(self):
842        # bug 448951 (similar to 429357, but with single char match)
843        # (Also test greedy matches.)
844        for op in '','?','*':
845            self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
846                             (None, None))
847            self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
848                             ('a:', 'a'))
849
850    def test_bug_725106(self):
851        # capturing groups in alternatives in repeats
852        self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
853                         ('b', 'a'))
854        self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
855                         ('c', 'b'))
856        self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
857                         ('b', None))
858        self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
859                         ('b', None))
860        self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
861                         ('b', 'a'))
862        self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
863                         ('c', 'b'))
864        self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
865                         ('b', None))
866        self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
867                         ('b', None))
868
869    def test_bug_725149(self):
870        # mark_stack_base restoring before restoring marks
871        self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
872                         ('a', None))
873        self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
874                         ('a', None, None))
875
876    @requires_unicode
877    def test_bug_764548(self):
878        # bug 764548, re.compile() barfs on str/unicode subclasses
879        class my_unicode(unicode): pass
880        pat = re.compile(my_unicode("abc"))
881        self.assertIsNone(pat.match("xyz"))
882
883    def test_finditer(self):
884        iter = re.finditer(r":+", "a:b::c:::d")
885        self.assertEqual([item.group(0) for item in iter],
886                         [":", "::", ":::"])
887
888    @requires_unicode
889    def test_bug_926075(self):
890        self.assertIsNot(re.compile('bug_926075'),
891                         re.compile(u'bug_926075'))
892
893    @requires_unicode
894    def test_bug_931848(self):
895        pattern = u(r"[\u002E\u3002\uFF0E\uFF61]")
896        self.assertEqual(re.compile(pattern).split("a.b.c"),
897                         ['a','b','c'])
898
899    def test_bug_581080(self):
900        iter = re.finditer(r"\s", "a b")
901        self.assertEqual(iter.next().span(), (1,2))
902        self.assertRaises(StopIteration, iter.next)
903
904        scanner = re.compile(r"\s").scanner("a b")
905        self.assertEqual(scanner.search().span(), (1, 2))
906        self.assertIsNone(scanner.search())
907
908    def test_bug_817234(self):
909        iter = re.finditer(r".*", "asdf")
910        self.assertEqual(iter.next().span(), (0, 4))
911        self.assertEqual(iter.next().span(), (4, 4))
912        self.assertRaises(StopIteration, iter.next)
913
914    @requires_unicode
915    def test_bug_6561(self):
916        # '\d' should match characters in Unicode category 'Nd'
917        # (Number, Decimal Digit), but not those in 'Nl' (Number,
918        # Letter) or 'No' (Number, Other).
919        decimal_digits = [
920            unichr(0x0037), # '\N{DIGIT SEVEN}', category 'Nd'
921            unichr(0x0e58), # '\N{THAI DIGIT SIX}', category 'Nd'
922            unichr(0xff10), # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
923            ]
924        for x in decimal_digits:
925            self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
926
927        not_decimal_digits = [
928            unichr(0x2165), # '\N{ROMAN NUMERAL SIX}', category 'Nl'
929            unichr(0x3039), # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
930            unichr(0x2082), # '\N{SUBSCRIPT TWO}', category 'No'
931            unichr(0x32b4), # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
932            ]
933        for x in not_decimal_digits:
934            self.assertIsNone(re.match('^\d$', x, re.UNICODE))
935
936    def test_empty_array(self):
937        # SF buf 1647541
938        import array
939        typecodes = 'cbBhHiIlLfd'
940        if have_unicode:
941            typecodes += 'u'
942        for typecode in typecodes:
943            a = array.array(typecode)
944            self.assertIsNone(re.compile("bla").match(a))
945            self.assertEqual(re.compile("").match(a).groups(), ())
946
947    @requires_unicode
948    def test_inline_flags(self):
949        # Bug #1700
950        upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
951        lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
952
953        p = re.compile(upper_char, re.I | re.U)
954        q = p.match(lower_char)
955        self.assertTrue(q)
956
957        p = re.compile(lower_char, re.I | re.U)
958        q = p.match(upper_char)
959        self.assertTrue(q)
960
961        p = re.compile('(?i)' + upper_char, re.U)
962        q = p.match(lower_char)
963        self.assertTrue(q)
964
965        p = re.compile('(?i)' + lower_char, re.U)
966        q = p.match(upper_char)
967        self.assertTrue(q)
968
969        p = re.compile('(?iu)' + upper_char)
970        q = p.match(lower_char)
971        self.assertTrue(q)
972
973        p = re.compile('(?iu)' + lower_char)
974        q = p.match(upper_char)
975        self.assertTrue(q)
976
977        self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char))
978        self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char))
979
980        # Incompatibilities
981        re.purge()
982        with check_py3k_warnings():
983            re.compile('', re.LOCALE|re.UNICODE)
984        with check_py3k_warnings():
985            re.compile('(?L)', re.UNICODE)
986        with check_py3k_warnings():
987            re.compile('(?u)', re.LOCALE)
988        with check_py3k_warnings():
989            re.compile('(?Lu)')
990        with check_py3k_warnings():
991            re.compile('(?uL)')
992
993    def test_dollar_matches_twice(self):
994        "$ matches the end of string, and just before the terminating \n"
995        pattern = re.compile('$')
996        self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
997        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
998        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
999
1000        pattern = re.compile('$', re.MULTILINE)
1001        self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1002        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1003        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1004
1005    def test_dealloc(self):
1006        # issue 3299: check for segfault in debug build
1007        import _sre
1008        # the overflow limit is different on wide and narrow builds and it
1009        # depends on the definition of SRE_CODE (see sre.h).
1010        # 2**128 should be big enough to overflow on both. For smaller values
1011        # a RuntimeError is raised instead of OverflowError.
1012        long_overflow = 2**128
1013        self.assertRaises(TypeError, re.finditer, "a", {})
1014        self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
1015
1016    def test_compile(self):
1017        # Test return value when given string and pattern as parameter
1018        pattern = re.compile('random pattern')
1019        self.assertIsInstance(pattern, re._pattern_type)
1020        same_pattern = re.compile(pattern)
1021        self.assertIsInstance(same_pattern, re._pattern_type)
1022        self.assertIs(same_pattern, pattern)
1023        # Test behaviour when not given a string or pattern as parameter
1024        self.assertRaises(TypeError, re.compile, 0)
1025
1026    def test_bug_13899(self):
1027        # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1028        # nothing. Ditto B and Z.
1029        with check_py3k_warnings():
1030            self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1031                             ['A', 'B', '\b', 'C', 'Z'])
1032
1033    @precisionbigmemtest(size=_2G, memuse=1)
1034    def test_large_search(self, size):
1035        # Issue #10182: indices were 32-bit-truncated.
1036        s = 'a' * size
1037        m = re.search('$', s)
1038        self.assertIsNotNone(m)
1039        self.assertEqual(m.start(), size)
1040        self.assertEqual(m.end(), size)
1041
1042    # The huge memuse is because of re.sub() using a list and a join()
1043    # to create the replacement result.
1044    @precisionbigmemtest(size=_2G, memuse=16 + 2)
1045    def test_large_subn(self, size):
1046        # Issue #10182: indices were 32-bit-truncated.
1047        s = 'a' * size
1048        r, n = re.subn('', '', s)
1049        self.assertEqual(r, s)
1050        self.assertEqual(n, size + 1)
1051
1052
1053    def test_repeat_minmax_overflow(self):
1054        # Issue #13169
1055        string = "x" * 100000
1056        self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1057        self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1058        self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1059        self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1060        self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1061        self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1062        # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1063        self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1064        self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1065        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1066        self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1067
1068    @cpython_only
1069    def test_repeat_minmax_overflow_maxrepeat(self):
1070        try:
1071            from _sre import MAXREPEAT
1072        except ImportError:
1073            self.skipTest('requires _sre.MAXREPEAT constant')
1074        string = "x" * 100000
1075        self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1076        self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1077                         (0, 100000))
1078        self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1079        self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1080        self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1081        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1082
1083    def test_backref_group_name_in_exception(self):
1084        # Issue 17341: Poor error message when compiling invalid regex
1085        with self.assertRaisesRegexp(sre_constants.error, '<foo>'):
1086            re.compile('(?P=<foo>)')
1087
1088    def test_group_name_in_exception(self):
1089        # Issue 17341: Poor error message when compiling invalid regex
1090        with self.assertRaisesRegexp(sre_constants.error, '\?foo'):
1091            re.compile('(?P<?foo>)')
1092
1093    def test_issue17998(self):
1094        for reps in '*', '+', '?', '{1}':
1095            for mod in '', '?':
1096                pattern = '.' + reps + mod + 'yz'
1097                self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1098                                 ['xyz'], msg=pattern)
1099                if have_unicode:
1100                    pattern = unicode(pattern)
1101                    self.assertEqual(re.compile(pattern, re.S).findall(u'xyz'),
1102                                     [u'xyz'], msg=pattern)
1103
1104
1105    def test_bug_2537(self):
1106        # issue 2537: empty submatches
1107        for outer_op in ('{0,}', '*', '+', '{1,187}'):
1108            for inner_op in ('{0,}', '*', '?'):
1109                r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1110                m = r.match("xyyzy")
1111                self.assertEqual(m.group(0), "xyy")
1112                self.assertEqual(m.group(1), "")
1113                self.assertEqual(m.group(2), "y")
1114
1115    def test_debug_flag(self):
1116        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
1117        with captured_stdout() as out:
1118            re.compile(pat, re.DEBUG)
1119        dump = '''\
1120subpattern 1
1121  literal 46
1122subpattern None
1123  branch
1124    in
1125      literal 99
1126      literal 104
1127  or
1128    literal 112
1129    literal 121
1130subpattern None
1131  groupref_exists 1
1132    at at_end
1133  else
1134    literal 58
1135    literal 32
1136'''
1137        self.assertEqual(out.getvalue(), dump)
1138        # Debug output is output again even a second time (bypassing
1139        # the cache -- issue #20426).
1140        with captured_stdout() as out:
1141            re.compile(pat, re.DEBUG)
1142        self.assertEqual(out.getvalue(), dump)
1143
1144    def test_keyword_parameters(self):
1145        # Issue #20283: Accepting the string keyword parameter.
1146        pat = re.compile(r'(ab)')
1147        self.assertEqual(
1148            pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1149        self.assertEqual(
1150            pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1151        self.assertEqual(
1152            pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1153        self.assertEqual(
1154            pat.split(string='abracadabra', maxsplit=1),
1155            ['', 'ab', 'racadabra'])
1156
1157    def test_match_group_takes_long(self):
1158        self.assertEqual(re.match("(foo)", "foo").group(1L), "foo")
1159        self.assertRaises(IndexError, re.match("", "").group, sys.maxint + 1)
1160
1161    def test_locale_caching(self):
1162        # Issue #22410
1163        oldlocale = locale.setlocale(locale.LC_CTYPE)
1164        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1165        for loc in 'en_US.iso88591', 'en_US.utf8':
1166            try:
1167                locale.setlocale(locale.LC_CTYPE, loc)
1168            except locale.Error:
1169                # Unsupported locale on this system
1170                self.skipTest('test needs %s locale' % loc)
1171
1172        re.purge()
1173        self.check_en_US_iso88591()
1174        self.check_en_US_utf8()
1175        re.purge()
1176        self.check_en_US_utf8()
1177        self.check_en_US_iso88591()
1178
1179    def check_en_US_iso88591(self):
1180        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1181        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1182        self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1183        self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1184        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1185        self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1186        self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1187
1188    def check_en_US_utf8(self):
1189        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1190        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1191        self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1192        self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1193        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1194        self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1195        self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1196
1197
1198def run_re_tests():
1199    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
1200    if verbose:
1201        print 'Running re_tests test suite'
1202    else:
1203        # To save time, only run the first and last 10 tests
1204        #tests = tests[:10] + tests[-10:]
1205        pass
1206
1207    for t in tests:
1208        sys.stdout.flush()
1209        pattern = s = outcome = repl = expected = None
1210        if len(t) == 5:
1211            pattern, s, outcome, repl, expected = t
1212        elif len(t) == 3:
1213            pattern, s, outcome = t
1214        else:
1215            raise ValueError, ('Test tuples should have 3 or 5 fields', t)
1216
1217        try:
1218            obj = re.compile(pattern)
1219        except re.error:
1220            if outcome == SYNTAX_ERROR: pass  # Expected a syntax error
1221            else:
1222                print '=== Syntax error:', t
1223        except KeyboardInterrupt: raise KeyboardInterrupt
1224        except:
1225            print '*** Unexpected error ***', t
1226            if verbose:
1227                traceback.print_exc(file=sys.stdout)
1228        else:
1229            try:
1230                result = obj.search(s)
1231            except re.error, msg:
1232                print '=== Unexpected exception', t, repr(msg)
1233            if outcome == SYNTAX_ERROR:
1234                # This should have been a syntax error; forget it.
1235                pass
1236            elif outcome == FAIL:
1237                if result is None: pass   # No match, as expected
1238                else: print '=== Succeeded incorrectly', t
1239            elif outcome == SUCCEED:
1240                if result is not None:
1241                    # Matched, as expected, so now we compute the
1242                    # result string and compare it to our expected result.
1243                    start, end = result.span(0)
1244                    vardict={'found': result.group(0),
1245                             'groups': result.group(),
1246                             'flags': result.re.flags}
1247                    for i in range(1, 100):
1248                        try:
1249                            gi = result.group(i)
1250                            # Special hack because else the string concat fails:
1251                            if gi is None:
1252                                gi = "None"
1253                        except IndexError:
1254                            gi = "Error"
1255                        vardict['g%d' % i] = gi
1256                    for i in result.re.groupindex.keys():
1257                        try:
1258                            gi = result.group(i)
1259                            if gi is None:
1260                                gi = "None"
1261                        except IndexError:
1262                            gi = "Error"
1263                        vardict[i] = gi
1264                    repl = eval(repl, vardict)
1265                    if repl != expected:
1266                        print '=== grouping error', t,
1267                        print repr(repl) + ' should be ' + repr(expected)
1268                else:
1269                    print '=== Failed incorrectly', t
1270
1271                # Try the match on a unicode string, and check that it
1272                # still succeeds.
1273                try:
1274                    result = obj.search(unicode(s, "latin-1"))
1275                    if result is None:
1276                        print '=== Fails on unicode match', t
1277                except NameError:
1278                    continue # 1.5.2
1279                except TypeError:
1280                    continue # unicode test case
1281
1282                # Try the match on a unicode pattern, and check that it
1283                # still succeeds.
1284                obj=re.compile(unicode(pattern, "latin-1"))
1285                result = obj.search(s)
1286                if result is None:
1287                    print '=== Fails on unicode pattern match', t
1288
1289                # Try the match with the search area limited to the extent
1290                # of the match and see if it still succeeds.  \B will
1291                # break (because it won't match at the end or start of a
1292                # string), so we'll ignore patterns that feature it.
1293
1294                if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
1295                               and result is not None:
1296                    obj = re.compile(pattern)
1297                    result = obj.search(s, result.start(0), result.end(0) + 1)
1298                    if result is None:
1299                        print '=== Failed on range-limited match', t
1300
1301                # Try the match with IGNORECASE enabled, and check that it
1302                # still succeeds.
1303                obj = re.compile(pattern, re.IGNORECASE)
1304                result = obj.search(s)
1305                if result is None:
1306                    print '=== Fails on case-insensitive match', t
1307
1308                # Try the match with LOCALE enabled, and check that it
1309                # still succeeds.
1310                obj = re.compile(pattern, re.LOCALE)
1311                result = obj.search(s)
1312                if result is None:
1313                    print '=== Fails on locale-sensitive match', t
1314
1315                # Try the match with UNICODE locale enabled, and check
1316                # that it still succeeds.
1317                obj = re.compile(pattern, re.UNICODE)
1318                result = obj.search(s)
1319                if result is None:
1320                    print '=== Fails on unicode-sensitive match', t
1321
1322def test_main():
1323    run_unittest(ReTests)
1324    deprecations = [
1325        ('bad escape', DeprecationWarning),
1326    ]
1327    with check_py3k_warnings(*deprecations):
1328        run_re_tests()
1329
1330if __name__ == "__main__":
1331    test_main()
1332