• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# -*- coding: utf-8 -*-
2from test.test_support import (
3    verbose, run_unittest, import_module,
4    precisionbigmemtest, _2G, cpython_only,
5    captured_stdout, have_unicode, requires_unicode, u,
6    check_warnings)
7import locale
8import re
9from re import Scanner
10import sre_constants
11import sys
12import string
13import traceback
14from weakref import proxy
15
16
17# Misc tests from Tim Peters' re.doc
18
19# WARNING: Don't change details in these tests if you don't know
20# what you're doing. Some of these tests were carefully modeled to
21# cover most of the code.
22
23import unittest
24
25class ReTests(unittest.TestCase):
26
27    def test_weakref(self):
28        s = 'QabbbcR'
29        x = re.compile('ab+c')
30        y = proxy(x)
31        self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
32
33    def test_search_star_plus(self):
34        self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
35        self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
36        self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
37        self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
38        self.assertIsNone(re.search('x', 'aaa'))
39        self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
40        self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
41        self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
42        self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
43        self.assertIsNone(re.match('a+', 'xxx'))
44
45    def bump_num(self, matchobj):
46        int_value = int(matchobj.group(0))
47        return str(int_value + 1)
48
49    def test_basic_re_sub(self):
50        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
51        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
52                         '9.3 -3 24x100y')
53        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
54                         '9.3 -3 23x99y')
55
56        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
57        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
58
59        s = r"\1\1"
60        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
61        self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
62        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
63
64        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
65        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
66        self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
67        self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
68
69        self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
70                         '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
71        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
72        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
73                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
74
75        self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
76
77    def test_bug_449964(self):
78        # fails for group followed by other escape
79        self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
80                         'xx\bxx\b')
81
82    def test_bug_449000(self):
83        # Test for sub() on escaped characters
84        self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
85                         'abc\ndef\n')
86        self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
87                         'abc\ndef\n')
88        self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
89                         'abc\ndef\n')
90        self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
91                         'abc\ndef\n')
92
93    @requires_unicode
94    def test_bug_1140(self):
95        # re.sub(x, y, u'') should return u'', not '', and
96        # re.sub(x, y, '') should return '', not u''.
97        # Also:
98        # re.sub(x, y, unicode(x)) should return unicode(y), and
99        # re.sub(x, y, str(x)) should return
100        #     str(y) if isinstance(y, str) else unicode(y).
101        for x in 'x', u'x':
102            for y in 'y', u'y':
103                z = re.sub(x, y, u'')
104                self.assertEqual(z, u'')
105                self.assertEqual(type(z), unicode)
106                #
107                z = re.sub(x, y, '')
108                self.assertEqual(z, '')
109                self.assertEqual(type(z), str)
110                #
111                z = re.sub(x, y, unicode(x))
112                self.assertEqual(z, y)
113                self.assertEqual(type(z), unicode)
114                #
115                z = re.sub(x, y, str(x))
116                self.assertEqual(z, y)
117                self.assertEqual(type(z), type(y))
118
119    def test_bug_1661(self):
120        # Verify that flags do not get silently ignored with compiled patterns
121        pattern = re.compile('.')
122        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
123        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
124        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
125        self.assertRaises(ValueError, re.compile, pattern, re.I)
126
127    def test_bug_3629(self):
128        # A regex that triggered a bug in the sre-code validator
129        re.compile("(?P<quote>)(?(quote))")
130
131    def test_sub_template_numeric_escape(self):
132        # bug 776311 and friends
133        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
134        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
135        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
136        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
137        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
138        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
139        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
140
141        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
142        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
143
144        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
145        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
146        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
147        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
148        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
149
150        self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
151        self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
152
153        self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
154        self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
155        self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
156        self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
157        self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
158        self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
159        self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
160        self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
161        self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
162        self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
163        self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
164        self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
165
166        # in python2.3 (etc), these loop endlessly in sre_parser.py
167        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
168        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
169                         'xz8')
170        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
171                         'xza')
172
173    def test_qualified_re_sub(self):
174        self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
175        self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
176
177    def test_bug_114660(self):
178        self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello  there'),
179                         'hello there')
180
181    def test_bug_462270(self):
182        # Test for empty sub() behaviour, see SF bug #462270
183        self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
184        self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
185
186    def test_symbolic_groups(self):
187        re.compile('(?P<a>x)(?P=a)(?(a)y)')
188        re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
189        self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
190        self.assertRaises(re.error, re.compile, '(?Px)')
191        self.assertRaises(re.error, re.compile, '(?P=)')
192        self.assertRaises(re.error, re.compile, '(?P=1)')
193        self.assertRaises(re.error, re.compile, '(?P=a)')
194        self.assertRaises(re.error, re.compile, '(?P=a1)')
195        self.assertRaises(re.error, re.compile, '(?P=a.)')
196        self.assertRaises(re.error, re.compile, '(?P<)')
197        self.assertRaises(re.error, re.compile, '(?P<>)')
198        self.assertRaises(re.error, re.compile, '(?P<1>)')
199        self.assertRaises(re.error, re.compile, '(?P<a.>)')
200        self.assertRaises(re.error, re.compile, '(?())')
201        self.assertRaises(re.error, re.compile, '(?(a))')
202        self.assertRaises(re.error, re.compile, '(?(1a))')
203        self.assertRaises(re.error, re.compile, '(?(a.))')
204
205    def test_symbolic_refs(self):
206        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
207        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
208        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
209        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
210        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
211        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
212        self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
213        self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
214        self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
215        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
216
217    def test_re_subn(self):
218        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
219        self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
220        self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
221        self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
222        self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
223
224    def test_re_split(self):
225        self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
226        self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
227        self.assertEqual(re.split("(:*)", ":a:b::c"),
228                         ['', ':', 'a', ':', 'b', '::', 'c'])
229        self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
230        self.assertEqual(re.split("(:)*", ":a:b::c"),
231                         ['', ':', 'a', ':', 'b', ':', 'c'])
232        self.assertEqual(re.split("([b:]+)", ":a:b::c"),
233                         ['', ':', 'a', ':b::', 'c'])
234        self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
235                         ['', None, ':', 'a', None, ':', '', 'b', None, '',
236                          None, '::', 'c'])
237        self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
238                         ['', 'a', '', '', 'c'])
239
240    def test_qualified_re_split(self):
241        self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
242        self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
243        self.assertEqual(re.split("(:)", ":a:b::c", 2),
244                         ['', ':', 'a', ':', 'b::c'])
245        self.assertEqual(re.split("(:*)", ":a:b::c", 2),
246                         ['', ':', 'a', ':', 'b::c'])
247
248    def test_re_findall(self):
249        self.assertEqual(re.findall(":+", "abc"), [])
250        self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
251        self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
252        self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
253                                                               (":", ":"),
254                                                               (":", "::")])
255
256    def test_bug_117612(self):
257        self.assertEqual(re.findall(r"(a|(b))", "aba"),
258                         [("a", ""),("b", "b"),("a", "")])
259
260    def test_re_match(self):
261        self.assertEqual(re.match('a', 'a').groups(), ())
262        self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
263        self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
264        self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
265        self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
266
267        pat = re.compile('((a)|(b))(c)?')
268        self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
269        self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
270        self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
271        self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
272        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
273
274        # A single group
275        m = re.match('(a)', 'a')
276        self.assertEqual(m.group(0), 'a')
277        self.assertEqual(m.group(0), 'a')
278        self.assertEqual(m.group(1), 'a')
279        self.assertEqual(m.group(1, 1), ('a', 'a'))
280
281        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
282        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
283        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
284                         (None, 'b', None))
285        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
286
287    def test_re_groupref_exists(self):
288        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
289                         ('(', 'a'))
290        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
291                         (None, 'a'))
292        self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
293        self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
294        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
295                         ('a', 'b'))
296        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
297                         (None, 'd'))
298        self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
299                         (None, 'd'))
300        self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
301                         ('a', ''))
302
303        # Tests for bug #1177831: exercise groups other than the first group
304        p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
305        self.assertEqual(p.match('abc').groups(),
306                         ('a', 'b', 'c'))
307        self.assertEqual(p.match('ad').groups(),
308                         ('a', None, 'd'))
309        self.assertIsNone(p.match('abd'))
310        self.assertIsNone(p.match('ac'))
311
312
313    def test_re_groupref(self):
314        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
315                         ('|', 'a'))
316        self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
317                         (None, 'a'))
318        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
319        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
320        self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
321                         ('a', 'a'))
322        self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
323                         (None, None))
324
325    def test_groupdict(self):
326        self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
327                                  'first second').groupdict(),
328                         {'first':'first', 'second':'second'})
329
330    def test_expand(self):
331        self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
332                                  "first second")
333                                  .expand(r"\2 \1 \g<second> \g<first>"),
334                         "second first second first")
335
336    def test_repeat_minmax(self):
337        self.assertIsNone(re.match("^(\w){1}$", "abc"))
338        self.assertIsNone(re.match("^(\w){1}?$", "abc"))
339        self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
340        self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
341
342        self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
343        self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
344        self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
345        self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
346        self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
347        self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
348        self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
349        self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
350
351        self.assertIsNone(re.match("^x{1}$", "xxx"))
352        self.assertIsNone(re.match("^x{1}?$", "xxx"))
353        self.assertIsNone(re.match("^x{1,2}$", "xxx"))
354        self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
355
356        self.assertTrue(re.match("^x{3}$", "xxx"))
357        self.assertTrue(re.match("^x{1,3}$", "xxx"))
358        self.assertTrue(re.match("^x{1,4}$", "xxx"))
359        self.assertTrue(re.match("^x{3,4}?$", "xxx"))
360        self.assertTrue(re.match("^x{3}?$", "xxx"))
361        self.assertTrue(re.match("^x{1,3}?$", "xxx"))
362        self.assertTrue(re.match("^x{1,4}?$", "xxx"))
363        self.assertTrue(re.match("^x{3,4}?$", "xxx"))
364
365        self.assertIsNone(re.match("^x{}$", "xxx"))
366        self.assertTrue(re.match("^x{}$", "x{}"))
367
368    def test_getattr(self):
369        self.assertEqual(re.match("(a)", "a").pos, 0)
370        self.assertEqual(re.match("(a)", "a").endpos, 1)
371        self.assertEqual(re.match("(a)", "a").string, "a")
372        self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
373        self.assertTrue(re.match("(a)", "a").re)
374
375    def test_special_escapes(self):
376        self.assertEqual(re.search(r"\b(b.)\b",
377                                   "abcd abc bcd bx").group(1), "bx")
378        self.assertEqual(re.search(r"\B(b.)\B",
379                                   "abc bcd bc abxd").group(1), "bx")
380        self.assertEqual(re.search(r"\b(b.)\b",
381                                   "abcd abc bcd bx", re.LOCALE).group(1), "bx")
382        self.assertEqual(re.search(r"\B(b.)\B",
383                                   "abc bcd bc abxd", re.LOCALE).group(1), "bx")
384        if have_unicode:
385            self.assertEqual(re.search(r"\b(b.)\b",
386                                       "abcd abc bcd bx", re.UNICODE).group(1), "bx")
387            self.assertEqual(re.search(r"\B(b.)\B",
388                                       "abc bcd bc abxd", re.UNICODE).group(1), "bx")
389        self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
390        self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
391        self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
392        self.assertEqual(re.search(r"\b(b.)\b",
393                                   u"abcd abc bcd bx").group(1), "bx")
394        self.assertEqual(re.search(r"\B(b.)\B",
395                                   u"abc bcd bc abxd").group(1), "bx")
396        self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
397        self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
398        self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M))
399        self.assertEqual(re.search(r"\d\D\w\W\s\S",
400                                   "1aa! a").group(0), "1aa! a")
401        self.assertEqual(re.search(r"\d\D\w\W\s\S",
402                                   "1aa! a", re.LOCALE).group(0), "1aa! a")
403        if have_unicode:
404            self.assertEqual(re.search(r"\d\D\w\W\s\S",
405                                       "1aa! a", re.UNICODE).group(0), "1aa! a")
406
407    def test_string_boundaries(self):
408        # See http://bugs.python.org/issue10713
409        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
410                         "abc")
411        # There's a word boundary at the start of a string.
412        self.assertTrue(re.match(r"\b", "abc"))
413        # A non-empty string includes a non-boundary zero-length match.
414        self.assertTrue(re.search(r"\B", "abc"))
415        # There is no non-boundary match at the start of a string.
416        self.assertFalse(re.match(r"\B", "abc"))
417        # However, an empty string contains no word boundaries, and also no
418        # non-boundaries.
419        self.assertIsNone(re.search(r"\B", ""))
420        # This one is questionable and different from the perlre behaviour,
421        # but describes current behavior.
422        self.assertIsNone(re.search(r"\b", ""))
423        # A single word-character string has two boundaries, but no
424        # non-boundary gaps.
425        self.assertEqual(len(re.findall(r"\b", "a")), 2)
426        self.assertEqual(len(re.findall(r"\B", "a")), 0)
427        # If there are no words, there are no boundaries
428        self.assertEqual(len(re.findall(r"\b", " ")), 0)
429        self.assertEqual(len(re.findall(r"\b", "   ")), 0)
430        # Can match around the whitespace.
431        self.assertEqual(len(re.findall(r"\B", " ")), 2)
432
433    @requires_unicode
434    def test_bigcharset(self):
435        self.assertEqual(re.match(u(r"([\u2222\u2223])"),
436                                  unichr(0x2222)).group(1), unichr(0x2222))
437        self.assertEqual(re.match(u(r"([\u2222\u2223])"),
438                                  unichr(0x2222), re.UNICODE).group(1), unichr(0x2222))
439        r = u'[%s]' % u''.join(map(unichr, range(256, 2**16, 255)))
440        self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(), unichr(0xff01))
441
442    def test_big_codesize(self):
443        # Issue #1160
444        r = re.compile('|'.join(('%d'%x for x in range(10000))))
445        self.assertTrue(r.match('1000'))
446        self.assertTrue(r.match('9999'))
447
448    def test_anyall(self):
449        self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
450                         "a\nb")
451        self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
452                         "a\n\nb")
453
454    def test_lookahead(self):
455        self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
456        self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
457        self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
458        self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
459        self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
460        self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
461        self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
462
463        self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
464        self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
465        self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
466        self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
467
468        # Group reference.
469        self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
470        self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
471        # Named group reference.
472        self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba'))
473        self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac'))
474        # Conditional group reference.
475        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
476        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
477        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
478        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
479        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
480        # Group used before defined.
481        self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
482        self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
483        self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
484
485    def test_lookbehind(self):
486        self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
487        self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
488        self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
489        self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
490        # Group reference.
491        with check_warnings(('', RuntimeWarning)):
492            re.compile(r'(a)a(?<=\1)c')
493        # Named group reference.
494        with check_warnings(('', RuntimeWarning)):
495            re.compile(r'(?P<g>a)a(?<=(?P=g))c')
496        # Conditional group reference.
497        with check_warnings(('', RuntimeWarning)):
498            re.compile(r'(a)b(?<=(?(1)b|x))c')
499        # Group used before defined.
500        with check_warnings(('', RuntimeWarning)):
501            re.compile(r'(a)b(?<=(?(2)b|x))(c)')
502
503    def test_ignore_case(self):
504        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
505        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
506        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
507        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
508        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
509        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
510        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
511        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
512        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
513        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
514
515        if have_unicode:
516            assert u(r'\u212a').lower() == u'k' # 'K'
517            self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
518            self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
519            self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
520            self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
521            assert u(r'\u017f').upper() == u'S' # 'ſ'
522            self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
523            self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
524            self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
525            self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
526
527    def test_ignore_case_set(self):
528        self.assertTrue(re.match(r'[19A]', 'A', re.I))
529        self.assertTrue(re.match(r'[19a]', 'a', re.I))
530        self.assertTrue(re.match(r'[19a]', 'A', re.I))
531        self.assertTrue(re.match(r'[19A]', 'a', re.I))
532        if have_unicode:
533            self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
534            self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
535            self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
536            self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
537            assert u(r'\u212a').lower() == u'k' # 'K'
538            self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
539            self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
540            self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
541            self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
542            assert u(r'\u017f').upper() == u'S' # 'ſ'
543            self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
544            self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
545            self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
546            self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
547
548    def test_ignore_case_range(self):
549        # Issues #3511, #17381.
550        self.assertTrue(re.match(r'[9-a]', '_', re.I))
551        self.assertIsNone(re.match(r'[9-A]', '_', re.I))
552        self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
553        self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
554        self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I))
555        self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
556        if have_unicode:
557            self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
558            self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
559            self.assertTrue(re.match(u(r'[\xc0-\xde]'),
560                                     u(r'\xd7'), re.U | re.I))
561            self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
562                                       u(r'\xf7'), re.U | re.I))
563            self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
564                                     u(r'\xf7'), re.U | re.I))
565            self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
566                                       u(r'\xd7'), re.U | re.I))
567            self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
568                                     u(r'\u0450'), re.U | re.I))
569            self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
570                                     u(r'\u0400'), re.U | re.I))
571            self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
572                                     u(r'\u0450'), re.U | re.I))
573            self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
574                                     u(r'\u0400'), re.U | re.I))
575            if sys.maxunicode > 0xffff:
576                self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
577                                         u(r'\U00010428'), re.U | re.I))
578                self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
579                                         u(r'\U00010400'), re.U | re.I))
580                self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
581                                         u(r'\U00010428'), re.U | re.I))
582                self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
583                                         u(r'\U00010400'), re.U | re.I))
584
585            assert u(r'\u212a').lower() == u'k' # 'K'
586            self.assertTrue(re.match(ur'[J-M]', u(r'\u212a'), re.U | re.I))
587            self.assertTrue(re.match(ur'[j-m]', u(r'\u212a'), re.U | re.I))
588            self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'K', re.U | re.I))
589            self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'k', re.U | re.I))
590            assert u(r'\u017f').upper() == u'S' # 'ſ'
591            self.assertTrue(re.match(ur'[R-T]', u(r'\u017f'), re.U | re.I))
592            self.assertTrue(re.match(ur'[r-t]', u(r'\u017f'), re.U | re.I))
593            self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u'S', re.U | re.I))
594            self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u's', re.U | re.I))
595
596    def test_category(self):
597        self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
598
599    def test_getlower(self):
600        import _sre
601        self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
602        self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
603        if have_unicode:
604            self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
605
606        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
607        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
608
609    def test_not_literal(self):
610        self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
611        self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
612
613    def test_search_coverage(self):
614        self.assertEqual(re.search("\s(b)", " b").group(1), "b")
615        self.assertEqual(re.search("a\s", "a ").group(0), "a ")
616
617    def assertMatch(self, pattern, text, match=None, span=None,
618                    matcher=re.match):
619        if match is None and span is None:
620            # the pattern matches the whole text
621            match = text
622            span = (0, len(text))
623        elif match is None or span is None:
624            raise ValueError('If match is not None, span should be specified '
625                             '(and vice versa).')
626        m = matcher(pattern, text)
627        self.assertTrue(m)
628        self.assertEqual(m.group(), match)
629        self.assertEqual(m.span(), span)
630
631    @requires_unicode
632    def test_re_escape(self):
633        alnum_chars = unicode(string.ascii_letters + string.digits)
634        p = u''.join(unichr(i) for i in range(256))
635        for c in p:
636            if c in alnum_chars:
637                self.assertEqual(re.escape(c), c)
638            elif c == u'\x00':
639                self.assertEqual(re.escape(c), u'\\000')
640            else:
641                self.assertEqual(re.escape(c), u'\\' + c)
642            self.assertMatch(re.escape(c), c)
643        self.assertMatch(re.escape(p), p)
644
645    def test_re_escape_byte(self):
646        alnum_chars = string.ascii_letters + string.digits
647        p = ''.join(chr(i) for i in range(256))
648        for b in p:
649            if b in alnum_chars:
650                self.assertEqual(re.escape(b), b)
651            elif b == b'\x00':
652                self.assertEqual(re.escape(b), b'\\000')
653            else:
654                self.assertEqual(re.escape(b), b'\\' + b)
655            self.assertMatch(re.escape(b), b)
656        self.assertMatch(re.escape(p), p)
657
658    @requires_unicode
659    def test_re_escape_non_ascii(self):
660        s = u(r'xxx\u2620\u2620\u2620xxx')
661        s_escaped = re.escape(s)
662        self.assertEqual(s_escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx'))
663        self.assertMatch(s_escaped, s)
664        self.assertMatch(u'.%s+.' % re.escape(unichr(0x2620)), s,
665                         u(r'x\u2620\u2620\u2620x'), (2, 7), re.search)
666
667    def test_re_escape_non_ascii_bytes(self):
668        b = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
669        b_escaped = re.escape(b)
670        self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
671        self.assertMatch(b_escaped, b)
672        res = re.findall(re.escape(b'\xe2\x98\xa0'), b)
673        self.assertEqual(len(res), 2)
674
675    def test_pickling(self):
676        import pickle
677        self.pickle_test(pickle)
678        import cPickle
679        self.pickle_test(cPickle)
680        # old pickles expect the _compile() reconstructor in sre module
681        import_module("sre", deprecated=True)
682        from sre import _compile
683        # current pickle expects the _compile() reconstructor in re module
684        from re import _compile
685
686    def pickle_test(self, pickle):
687        oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
688        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
689            pickled = pickle.dumps(oldpat, proto)
690            newpat = pickle.loads(pickled)
691            self.assertEqual(newpat, oldpat)
692
693    def test_constants(self):
694        self.assertEqual(re.I, re.IGNORECASE)
695        self.assertEqual(re.L, re.LOCALE)
696        self.assertEqual(re.M, re.MULTILINE)
697        self.assertEqual(re.S, re.DOTALL)
698        self.assertEqual(re.X, re.VERBOSE)
699
700    def test_flags(self):
701        for flag in [re.I, re.M, re.X, re.S, re.L]:
702            self.assertTrue(re.compile('^pattern$', flag))
703
704    def test_sre_character_literals(self):
705        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
706            self.assertTrue(re.match(r"\%03o" % i, chr(i)))
707            self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
708            self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
709            self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
710            self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
711            self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
712        self.assertRaises(re.error, re.match, "\911", "")
713
714    def test_sre_character_class_literals(self):
715        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
716            self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
717            self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
718            self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
719            self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
720            self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
721            self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
722        self.assertRaises(re.error, re.match, "[\911]", "")
723
724    def test_bug_113254(self):
725        self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
726        self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
727        self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
728
729    def test_bug_527371(self):
730        # bug described in patches 527371/672491
731        self.assertIsNone(re.match(r'(a)?a','a').lastindex)
732        self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
733        self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
734        self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
735        self.assertEqual(re.match("((a))", "a").lastindex, 1)
736
737    def test_bug_545855(self):
738        # bug 545855 -- This pattern failed to cause a compile error as it
739        # should, instead provoking a TypeError.
740        self.assertRaises(re.error, re.compile, 'foo[a-')
741
742    def test_bug_418626(self):
743        # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
744        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
745        # pattern '*?' on a long string.
746        self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
747        self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
748                         20003)
749        self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
750        # non-simple '*?' still used to hit the recursion limit, before the
751        # non-recursive scheme was implemented.
752        self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
753
754    @requires_unicode
755    def test_bug_612074(self):
756        pat=u"["+re.escape(unichr(0x2039))+u"]"
757        self.assertEqual(re.compile(pat) and 1, 1)
758
759    def test_stack_overflow(self):
760        # nasty cases that used to overflow the straightforward recursive
761        # implementation of repeated groups.
762        self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
763        self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
764        self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
765
766    def test_unlimited_zero_width_repeat(self):
767        # Issue #9669
768        self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
769        self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
770        self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
771        self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
772        self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
773        self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
774
775    def test_scanner(self):
776        def s_ident(scanner, token): return token
777        def s_operator(scanner, token): return "op%s" % token
778        def s_float(scanner, token): return float(token)
779        def s_int(scanner, token): return int(token)
780
781        scanner = Scanner([
782            (r"[a-zA-Z_]\w*", s_ident),
783            (r"\d+\.\d*", s_float),
784            (r"\d+", s_int),
785            (r"=|\+|-|\*|/", s_operator),
786            (r"\s+", None),
787            ])
788
789        self.assertTrue(scanner.scanner.scanner("").pattern)
790
791        self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
792                         (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
793                           'op+', 'bar'], ''))
794
795    def test_bug_448951(self):
796        # bug 448951 (similar to 429357, but with single char match)
797        # (Also test greedy matches.)
798        for op in '','?','*':
799            self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
800                             (None, None))
801            self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
802                             ('a:', 'a'))
803
804    def test_bug_725106(self):
805        # capturing groups in alternatives in repeats
806        self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
807                         ('b', 'a'))
808        self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
809                         ('c', 'b'))
810        self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
811                         ('b', None))
812        self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
813                         ('b', None))
814        self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
815                         ('b', 'a'))
816        self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
817                         ('c', 'b'))
818        self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
819                         ('b', None))
820        self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
821                         ('b', None))
822
823    def test_bug_725149(self):
824        # mark_stack_base restoring before restoring marks
825        self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
826                         ('a', None))
827        self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
828                         ('a', None, None))
829
830    @requires_unicode
831    def test_bug_764548(self):
832        # bug 764548, re.compile() barfs on str/unicode subclasses
833        class my_unicode(unicode): pass
834        pat = re.compile(my_unicode("abc"))
835        self.assertIsNone(pat.match("xyz"))
836
837    def test_finditer(self):
838        iter = re.finditer(r":+", "a:b::c:::d")
839        self.assertEqual([item.group(0) for item in iter],
840                         [":", "::", ":::"])
841
842    @requires_unicode
843    def test_bug_926075(self):
844        self.assertIsNot(re.compile('bug_926075'),
845                         re.compile(u'bug_926075'))
846
847    @requires_unicode
848    def test_bug_931848(self):
849        pattern = u(r"[\u002E\u3002\uFF0E\uFF61]")
850        self.assertEqual(re.compile(pattern).split("a.b.c"),
851                         ['a','b','c'])
852
853    def test_bug_581080(self):
854        iter = re.finditer(r"\s", "a b")
855        self.assertEqual(iter.next().span(), (1,2))
856        self.assertRaises(StopIteration, iter.next)
857
858        scanner = re.compile(r"\s").scanner("a b")
859        self.assertEqual(scanner.search().span(), (1, 2))
860        self.assertIsNone(scanner.search())
861
862    def test_bug_817234(self):
863        iter = re.finditer(r".*", "asdf")
864        self.assertEqual(iter.next().span(), (0, 4))
865        self.assertEqual(iter.next().span(), (4, 4))
866        self.assertRaises(StopIteration, iter.next)
867
868    @requires_unicode
869    def test_bug_6561(self):
870        # '\d' should match characters in Unicode category 'Nd'
871        # (Number, Decimal Digit), but not those in 'Nl' (Number,
872        # Letter) or 'No' (Number, Other).
873        decimal_digits = [
874            unichr(0x0037), # '\N{DIGIT SEVEN}', category 'Nd'
875            unichr(0x0e58), # '\N{THAI DIGIT SIX}', category 'Nd'
876            unichr(0xff10), # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
877            ]
878        for x in decimal_digits:
879            self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
880
881        not_decimal_digits = [
882            unichr(0x2165), # '\N{ROMAN NUMERAL SIX}', category 'Nl'
883            unichr(0x3039), # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
884            unichr(0x2082), # '\N{SUBSCRIPT TWO}', category 'No'
885            unichr(0x32b4), # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
886            ]
887        for x in not_decimal_digits:
888            self.assertIsNone(re.match('^\d$', x, re.UNICODE))
889
890    def test_empty_array(self):
891        # SF buf 1647541
892        import array
893        typecodes = 'cbBhHiIlLfd'
894        if have_unicode:
895            typecodes += 'u'
896        for typecode in typecodes:
897            a = array.array(typecode)
898            self.assertIsNone(re.compile("bla").match(a))
899            self.assertEqual(re.compile("").match(a).groups(), ())
900
901    @requires_unicode
902    def test_inline_flags(self):
903        # Bug #1700
904        upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
905        lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
906
907        p = re.compile(upper_char, re.I | re.U)
908        q = p.match(lower_char)
909        self.assertTrue(q)
910
911        p = re.compile(lower_char, re.I | re.U)
912        q = p.match(upper_char)
913        self.assertTrue(q)
914
915        p = re.compile('(?i)' + upper_char, re.U)
916        q = p.match(lower_char)
917        self.assertTrue(q)
918
919        p = re.compile('(?i)' + lower_char, re.U)
920        q = p.match(upper_char)
921        self.assertTrue(q)
922
923        p = re.compile('(?iu)' + upper_char)
924        q = p.match(lower_char)
925        self.assertTrue(q)
926
927        p = re.compile('(?iu)' + lower_char)
928        q = p.match(upper_char)
929        self.assertTrue(q)
930
931        self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char))
932        self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char))
933
934    def test_dollar_matches_twice(self):
935        "$ matches the end of string, and just before the terminating \n"
936        pattern = re.compile('$')
937        self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
938        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
939        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
940
941        pattern = re.compile('$', re.MULTILINE)
942        self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
943        self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
944        self.assertEqual(pattern.sub('#', '\n'), '#\n#')
945
946    def test_dealloc(self):
947        # issue 3299: check for segfault in debug build
948        import _sre
949        # the overflow limit is different on wide and narrow builds and it
950        # depends on the definition of SRE_CODE (see sre.h).
951        # 2**128 should be big enough to overflow on both. For smaller values
952        # a RuntimeError is raised instead of OverflowError.
953        long_overflow = 2**128
954        self.assertRaises(TypeError, re.finditer, "a", {})
955        self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
956
957    def test_compile(self):
958        # Test return value when given string and pattern as parameter
959        pattern = re.compile('random pattern')
960        self.assertIsInstance(pattern, re._pattern_type)
961        same_pattern = re.compile(pattern)
962        self.assertIsInstance(same_pattern, re._pattern_type)
963        self.assertIs(same_pattern, pattern)
964        # Test behaviour when not given a string or pattern as parameter
965        self.assertRaises(TypeError, re.compile, 0)
966
967    def test_bug_13899(self):
968        # Issue #13899: re pattern r"[\A]" should work like "A" but matches
969        # nothing. Ditto B and Z.
970        self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
971                         ['A', 'B', '\b', 'C', 'Z'])
972
973    @precisionbigmemtest(size=_2G, memuse=1)
974    def test_large_search(self, size):
975        # Issue #10182: indices were 32-bit-truncated.
976        s = 'a' * size
977        m = re.search('$', s)
978        self.assertIsNotNone(m)
979        self.assertEqual(m.start(), size)
980        self.assertEqual(m.end(), size)
981
982    # The huge memuse is because of re.sub() using a list and a join()
983    # to create the replacement result.
984    @precisionbigmemtest(size=_2G, memuse=16 + 2)
985    def test_large_subn(self, size):
986        # Issue #10182: indices were 32-bit-truncated.
987        s = 'a' * size
988        r, n = re.subn('', '', s)
989        self.assertEqual(r, s)
990        self.assertEqual(n, size + 1)
991
992
993    def test_repeat_minmax_overflow(self):
994        # Issue #13169
995        string = "x" * 100000
996        self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
997        self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
998        self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
999        self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1000        self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1001        self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1002        # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1003        self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1004        self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1005        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1006        self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1007
1008    @cpython_only
1009    def test_repeat_minmax_overflow_maxrepeat(self):
1010        try:
1011            from _sre import MAXREPEAT
1012        except ImportError:
1013            self.skipTest('requires _sre.MAXREPEAT constant')
1014        string = "x" * 100000
1015        self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1016        self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1017                         (0, 100000))
1018        self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1019        self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1020        self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1021        self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1022
1023    def test_backref_group_name_in_exception(self):
1024        # Issue 17341: Poor error message when compiling invalid regex
1025        with self.assertRaisesRegexp(sre_constants.error, '<foo>'):
1026            re.compile('(?P=<foo>)')
1027
1028    def test_group_name_in_exception(self):
1029        # Issue 17341: Poor error message when compiling invalid regex
1030        with self.assertRaisesRegexp(sre_constants.error, '\?foo'):
1031            re.compile('(?P<?foo>)')
1032
1033    def test_issue17998(self):
1034        for reps in '*', '+', '?', '{1}':
1035            for mod in '', '?':
1036                pattern = '.' + reps + mod + 'yz'
1037                self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1038                                 ['xyz'], msg=pattern)
1039                if have_unicode:
1040                    pattern = unicode(pattern)
1041                    self.assertEqual(re.compile(pattern, re.S).findall(u'xyz'),
1042                                     [u'xyz'], msg=pattern)
1043
1044
1045    def test_bug_2537(self):
1046        # issue 2537: empty submatches
1047        for outer_op in ('{0,}', '*', '+', '{1,187}'):
1048            for inner_op in ('{0,}', '*', '?'):
1049                r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1050                m = r.match("xyyzy")
1051                self.assertEqual(m.group(0), "xyy")
1052                self.assertEqual(m.group(1), "")
1053                self.assertEqual(m.group(2), "y")
1054
1055    def test_debug_flag(self):
1056        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
1057        with captured_stdout() as out:
1058            re.compile(pat, re.DEBUG)
1059        dump = '''\
1060subpattern 1
1061  literal 46
1062subpattern None
1063  branch
1064    in
1065      literal 99
1066      literal 104
1067  or
1068    literal 112
1069    literal 121
1070subpattern None
1071  groupref_exists 1
1072    at at_end
1073  else
1074    literal 58
1075    literal 32
1076'''
1077        self.assertEqual(out.getvalue(), dump)
1078        # Debug output is output again even a second time (bypassing
1079        # the cache -- issue #20426).
1080        with captured_stdout() as out:
1081            re.compile(pat, re.DEBUG)
1082        self.assertEqual(out.getvalue(), dump)
1083
1084    def test_keyword_parameters(self):
1085        # Issue #20283: Accepting the string keyword parameter.
1086        pat = re.compile(r'(ab)')
1087        self.assertEqual(
1088            pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1089        self.assertEqual(
1090            pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1091        self.assertEqual(
1092            pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1093        self.assertEqual(
1094            pat.split(string='abracadabra', maxsplit=1),
1095            ['', 'ab', 'racadabra'])
1096
1097    def test_match_group_takes_long(self):
1098        self.assertEqual(re.match("(foo)", "foo").group(1L), "foo")
1099        self.assertRaises(IndexError, re.match("", "").group, sys.maxint + 1)
1100
1101    def test_locale_caching(self):
1102        # Issue #22410
1103        oldlocale = locale.setlocale(locale.LC_CTYPE)
1104        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1105        for loc in 'en_US.iso88591', 'en_US.utf8':
1106            try:
1107                locale.setlocale(locale.LC_CTYPE, loc)
1108            except locale.Error:
1109                # Unsupported locale on this system
1110                self.skipTest('test needs %s locale' % loc)
1111
1112        re.purge()
1113        self.check_en_US_iso88591()
1114        self.check_en_US_utf8()
1115        re.purge()
1116        self.check_en_US_utf8()
1117        self.check_en_US_iso88591()
1118
1119    def check_en_US_iso88591(self):
1120        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1121        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1122        self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1123        self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1124        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1125        self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1126        self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1127
1128    def check_en_US_utf8(self):
1129        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1130        self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1131        self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1132        self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1133        self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1134        self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1135        self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1136
1137
1138def run_re_tests():
1139    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
1140    if verbose:
1141        print 'Running re_tests test suite'
1142    else:
1143        # To save time, only run the first and last 10 tests
1144        #tests = tests[:10] + tests[-10:]
1145        pass
1146
1147    for t in tests:
1148        sys.stdout.flush()
1149        pattern = s = outcome = repl = expected = None
1150        if len(t) == 5:
1151            pattern, s, outcome, repl, expected = t
1152        elif len(t) == 3:
1153            pattern, s, outcome = t
1154        else:
1155            raise ValueError, ('Test tuples should have 3 or 5 fields', t)
1156
1157        try:
1158            obj = re.compile(pattern)
1159        except re.error:
1160            if outcome == SYNTAX_ERROR: pass  # Expected a syntax error
1161            else:
1162                print '=== Syntax error:', t
1163        except KeyboardInterrupt: raise KeyboardInterrupt
1164        except:
1165            print '*** Unexpected error ***', t
1166            if verbose:
1167                traceback.print_exc(file=sys.stdout)
1168        else:
1169            try:
1170                result = obj.search(s)
1171            except re.error, msg:
1172                print '=== Unexpected exception', t, repr(msg)
1173            if outcome == SYNTAX_ERROR:
1174                # This should have been a syntax error; forget it.
1175                pass
1176            elif outcome == FAIL:
1177                if result is None: pass   # No match, as expected
1178                else: print '=== Succeeded incorrectly', t
1179            elif outcome == SUCCEED:
1180                if result is not None:
1181                    # Matched, as expected, so now we compute the
1182                    # result string and compare it to our expected result.
1183                    start, end = result.span(0)
1184                    vardict={'found': result.group(0),
1185                             'groups': result.group(),
1186                             'flags': result.re.flags}
1187                    for i in range(1, 100):
1188                        try:
1189                            gi = result.group(i)
1190                            # Special hack because else the string concat fails:
1191                            if gi is None:
1192                                gi = "None"
1193                        except IndexError:
1194                            gi = "Error"
1195                        vardict['g%d' % i] = gi
1196                    for i in result.re.groupindex.keys():
1197                        try:
1198                            gi = result.group(i)
1199                            if gi is None:
1200                                gi = "None"
1201                        except IndexError:
1202                            gi = "Error"
1203                        vardict[i] = gi
1204                    repl = eval(repl, vardict)
1205                    if repl != expected:
1206                        print '=== grouping error', t,
1207                        print repr(repl) + ' should be ' + repr(expected)
1208                else:
1209                    print '=== Failed incorrectly', t
1210
1211                # Try the match on a unicode string, and check that it
1212                # still succeeds.
1213                try:
1214                    result = obj.search(unicode(s, "latin-1"))
1215                    if result is None:
1216                        print '=== Fails on unicode match', t
1217                except NameError:
1218                    continue # 1.5.2
1219                except TypeError:
1220                    continue # unicode test case
1221
1222                # Try the match on a unicode pattern, and check that it
1223                # still succeeds.
1224                obj=re.compile(unicode(pattern, "latin-1"))
1225                result = obj.search(s)
1226                if result is None:
1227                    print '=== Fails on unicode pattern match', t
1228
1229                # Try the match with the search area limited to the extent
1230                # of the match and see if it still succeeds.  \B will
1231                # break (because it won't match at the end or start of a
1232                # string), so we'll ignore patterns that feature it.
1233
1234                if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
1235                               and result is not None:
1236                    obj = re.compile(pattern)
1237                    result = obj.search(s, result.start(0), result.end(0) + 1)
1238                    if result is None:
1239                        print '=== Failed on range-limited match', t
1240
1241                # Try the match with IGNORECASE enabled, and check that it
1242                # still succeeds.
1243                obj = re.compile(pattern, re.IGNORECASE)
1244                result = obj.search(s)
1245                if result is None:
1246                    print '=== Fails on case-insensitive match', t
1247
1248                # Try the match with LOCALE enabled, and check that it
1249                # still succeeds.
1250                obj = re.compile(pattern, re.LOCALE)
1251                result = obj.search(s)
1252                if result is None:
1253                    print '=== Fails on locale-sensitive match', t
1254
1255                # Try the match with UNICODE locale enabled, and check
1256                # that it still succeeds.
1257                obj = re.compile(pattern, re.UNICODE)
1258                result = obj.search(s)
1259                if result is None:
1260                    print '=== Fails on unicode-sensitive match', t
1261
def test_main():
    """Run the ReTests unittest suite, then the table-driven re_tests cases."""
    run_unittest(ReTests)
    run_re_tests()
1265
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
1268