# -*- coding: utf-8 -*-
from test.test_support import (
    verbose, run_unittest, import_module,
    precisionbigmemtest, _2G, cpython_only,
    captured_stdout, have_unicode, requires_unicode, u,
    check_warnings)
import locale
import re
from re import Scanner
import sre_constants
import sys
import string
import traceback
from weakref import proxy


# Misc tests from Tim Peters' re.doc

# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.

import unittest

# Regression tests for the `re` module.  Test methods are documented with
# `#` comments rather than docstrings on purpose: unittest reports a test's
# docstring instead of its method name, which would change the test output.
#
# NOTE(review): the layout of this section was restored from a
# whitespace-collapsed copy.  Block nesting and whitespace inside string
# literals should be confirmed against the upstream file; the spots that
# could not be established from the text are flagged individually below.
class ReTests(unittest.TestCase):

    def test_weakref(self):
        # A weakref proxy of a compiled pattern must give the same findall()
        # result as the pattern itself.
        # NOTE(review): `s` is assigned but never read; the literal is
        # repeated in the assertion instead.
        s = 'QabbbcR'
        x = re.compile('ab+c')
        y = proxy(x)
        self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))

    def test_search_star_plus(self):
        # Spans of `*` (may match empty) versus `+` (needs one char) for both
        # search() and match(); span() and span(0) must agree.
        self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
        self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
        self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
        self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
        self.assertIsNone(re.search('x', 'aaa'))
        self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
        self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
        self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
        self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
        self.assertIsNone(re.match('a+', 'xxx'))

    def bump_num(self, matchobj):
        """Return the whole match parsed as an int, plus one, as a string.

        Used as the replacement callable in test_basic_re_sub.
        """
        int_value = int(matchobj.group(0))
        return str(int_value + 1)

    def test_basic_re_sub(self):
        # sub() with string and callable replacements, a count limit,
        # group references and escape handling in the template.
        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                         '9.3 -3 24x100y')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                         '9.3 -3 23x99y')

        # A callable's return value is inserted verbatim; a string template
        # has its escapes processed.
        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

        s = r"\1\1"
        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
        self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

        # Named and numbered \g<...> references to the same group.
        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')

        self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                         '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

        self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')

    def test_bug_449964(self):
        # fails for group followed by other escape
        self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
                         'xx\bxx\b')

    def test_bug_449000(self):
        # Test for sub() on escaped characters
        # All four raw/non-raw combinations of pattern and replacement must
        # produce the same result.
        self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
                         'abc\ndef\n')
        self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
                         'abc\ndef\n')
        self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
                         'abc\ndef\n')
        self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
                         'abc\ndef\n')

    @requires_unicode
    def test_bug_1140(self):
        # re.sub(x, y, u'') should return u'', not '', and
        # re.sub(x, y, '') should return '', not u''.
        # Also:
        # re.sub(x, y, unicode(x)) should return unicode(y), and
        # re.sub(x, y, str(x)) should return
        #     str(y) if isinstance(y, str) else unicode(y).
        for x in 'x', u'x':
            for y in 'y', u'y':
                z = re.sub(x, y, u'')
                self.assertEqual(z, u'')
                self.assertEqual(type(z), unicode)
                #
                z = re.sub(x, y, '')
                self.assertEqual(z, '')
                self.assertEqual(type(z), str)
                #
                z = re.sub(x, y, unicode(x))
                self.assertEqual(z, y)
                self.assertEqual(type(z), unicode)
                #
                z = re.sub(x, y, str(x))
                self.assertEqual(z, y)
                self.assertEqual(type(z), type(y))

    def test_bug_1661(self):
        # Verify that flags do not get silently ignored with compiled patterns
        pattern = re.compile('.')
        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
        self.assertRaises(ValueError, re.compile, pattern, re.I)

    def test_bug_3629(self):
        # A regex that triggered a bug in the sre-code validator
        # (passes as long as compiling does not raise).
        re.compile("(?P<quote>)(?(quote))")

    def test_sub_template_numeric_escape(self):
        # bug 776311 and friends
        # Octal escapes in the replacement template.
        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')

        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

        self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
        self.assertEqual(re.sub('x', r'\777', 'x'), '\377')

        # The pattern 'x' has no groups, so these templates must be rejected.
        self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
        self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
        self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
        self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'

        # in python2.3 (etc), these loop endlessly in sre_parser.py
        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                         'xz8')
        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                         'xza')

    def test_qualified_re_sub(self):
        # sub() without and with a positional count argument.
        self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
        self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')

    def test_bug_114660(self):
        # NOTE(review): the input and the expected result are identical as
        # written, so the \s+ collapse is not exercised; whitespace inside
        # the input literal may have been lost -- confirm against upstream.
        self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
                         'hello there')

    def test_bug_462270(self):
        # Test for empty sub() behaviour, see SF bug #462270
        self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
        self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')

    def test_symbolic_groups(self):
        # Two well-formed uses of named groups compile; every malformed
        # (?P...) / (?(...)) form below must raise re.error.
        re.compile('(?P<a>x)(?P=a)(?(a)y)')
        re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
        self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
        self.assertRaises(re.error, re.compile, '(?Px)')
        self.assertRaises(re.error, re.compile, '(?P=)')
        self.assertRaises(re.error, re.compile, '(?P=1)')
        self.assertRaises(re.error, re.compile, '(?P=a)')
        self.assertRaises(re.error, re.compile, '(?P=a1)')
        self.assertRaises(re.error, re.compile, '(?P=a.)')
        self.assertRaises(re.error, re.compile, '(?P<)')
        self.assertRaises(re.error, re.compile, '(?P<>)')
        self.assertRaises(re.error, re.compile, '(?P<1>)')
        self.assertRaises(re.error, re.compile, '(?P<a.>)')
        self.assertRaises(re.error, re.compile, '(?())')
        self.assertRaises(re.error, re.compile, '(?(a))')
        self.assertRaises(re.error, re.compile, '(?(1a))')
        self.assertRaises(re.error, re.compile, '(?(a.))')

    def test_symbolic_refs(self):
        # Malformed or unusable group references in a replacement template.
        # Note the unknown name 'ab' is expected to raise IndexError, not
        # re.error.
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
        self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
        self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')

    def test_re_subn(self):
        # subn() returns (new_string, number_of_substitutions).
        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
        self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
        self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
        self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
        self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))

    def test_re_split(self):
        # split() with plain, capturing and non-capturing separators;
        # captured separators (or None for unmatched groups) appear in the
        # result list.
        self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
        self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
        self.assertEqual(re.split("(:*)", ":a:b::c"),
                         ['', ':', 'a', ':', 'b', '::', 'c'])
        self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
        self.assertEqual(re.split("(:)*", ":a:b::c"),
                         ['', ':', 'a', ':', 'b', ':', 'c'])
        self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                         ['', ':', 'a', ':b::', 'c'])
        self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                         ['', None, ':', 'a', None, ':', '', 'b', None, '',
                          None, '::', 'c'])
        self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                         ['', 'a', '', '', 'c'])

    def test_qualified_re_split(self):
        # split() with a positional maxsplit of 2.
        self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
        self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
        self.assertEqual(re.split("(:)", ":a:b::c", 2),
                         ['', ':', 'a', ':', 'b::c'])
        self.assertEqual(re.split("(:*)", ":a:b::c", 2),
                         ['', ':', 'a', ':', 'b::c'])

    def test_re_findall(self):
        # findall() returns strings for zero or one group and tuples for
        # several groups.
        self.assertEqual(re.findall(":+", "abc"), [])
        self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
        self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
        self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
                                                               (":", ":"),
                                                               (":", "::")])

    def test_bug_117612(self):
        # A group that did not participate is reported as "" by findall().
        self.assertEqual(re.findall(r"(a|(b))", "aba"),
                         [("a", ""),("b", "b"),("a", "")])

    def test_re_match(self):
        # groups() / group() by index and by name, including groups that did
        # not participate in the match.
        self.assertEqual(re.match('a', 'a').groups(), ())
        self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
        self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
        self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
        self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))

        pat = re.compile('((a)|(b))(c)?')
        self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
        self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
        self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
        self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
        # groups(default) substitutes the default for unmatched groups.
        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

        # A single group
        m = re.match('(a)', 'a')
        self.assertEqual(m.group(0), 'a')
        self.assertEqual(m.group(0), 'a')
        self.assertEqual(m.group(1), 'a')
        self.assertEqual(m.group(1, 1), ('a', 'a'))

        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                         (None, 'b', None))
        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))

    def test_re_groupref_exists(self):
        # Conditional patterns (?(group)yes|no), by number and by name.
        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
                         ('(', 'a'))
        self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
                         (None, 'a'))
        self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
        self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
                         ('a', 'b'))
        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
                         (None, 'd'))
        self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
                         (None, 'd'))
        self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
                         ('a', ''))

        # Tests for bug #1177831: exercise groups other than the first group
        p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
        self.assertEqual(p.match('abc').groups(),
                         ('a', 'b', 'c'))
        self.assertEqual(p.match('ad').groups(),
                         ('a', None, 'd'))
        self.assertIsNone(p.match('abd'))
        self.assertIsNone(p.match('ac'))


    def test_re_groupref(self):
        # Numeric backreferences, including to an optional group.
        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
                         ('|', 'a'))
        self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
                         (None, 'a'))
        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
        self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
                         ('a', 'a'))
        self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
                         (None, None))

    def test_groupdict(self):
        # groupdict() maps each group name to its matched text.
        self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
                                  'first second').groupdict(),
                         {'first':'first', 'second':'second'})

    def test_expand(self):
        # expand() resolves numeric and named references in a template.
        self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
                                  "first second")
                                  .expand(r"\2 \1 \g<second> \g<first>"),
                         "second first second first")

    def test_repeat_minmax(self):
        # {m}, {m,n} and their lazy forms, on a capturing group and on a
        # single literal.
        self.assertIsNone(re.match("^(\w){1}$", "abc"))
        self.assertIsNone(re.match("^(\w){1}?$", "abc"))
        self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
        self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))

        # A repeated group reports its last iteration.
        self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
        self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
        self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
        self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
        self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
        self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
        self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
        self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")

        self.assertIsNone(re.match("^x{1}$", "xxx"))
        self.assertIsNone(re.match("^x{1}?$", "xxx"))
        self.assertIsNone(re.match("^x{1,2}$", "xxx"))
        self.assertIsNone(re.match("^x{1,2}?$", "xxx"))

        self.assertTrue(re.match("^x{3}$", "xxx"))
        self.assertTrue(re.match("^x{1,3}$", "xxx"))
        self.assertTrue(re.match("^x{1,4}$", "xxx"))
        self.assertTrue(re.match("^x{3,4}?$", "xxx"))
        self.assertTrue(re.match("^x{3}?$", "xxx"))
        self.assertTrue(re.match("^x{1,3}?$", "xxx"))
        self.assertTrue(re.match("^x{1,4}?$", "xxx"))
        self.assertTrue(re.match("^x{3,4}?$", "xxx"))

        # An empty "{}" is not a repeat; it matches the literal text "{}".
        self.assertIsNone(re.match("^x{}$", "xxx"))
        self.assertTrue(re.match("^x{}$", "x{}"))

    def test_getattr(self):
        # Attributes exposed on a match object.
        self.assertEqual(re.match("(a)", "a").pos, 0)
        self.assertEqual(re.match("(a)", "a").endpos, 1)
        self.assertEqual(re.match("(a)", "a").string, "a")
        self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
        self.assertTrue(re.match("(a)", "a").re)

    def test_special_escapes(self):
        # \b, \B, \A, \Z and the \d\D\w\W\s\S classes under the default,
        # LOCALE and UNICODE modes.
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx").group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd").group(1), "bx")
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx", re.LOCALE).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd", re.LOCALE).group(1), "bx")
        # NOTE(review): only the two re.UNICODE assertions are placed under
        # this guard; the extent of the block could not be read from the
        # collapsed source -- confirm against upstream.
        if have_unicode:
            self.assertEqual(re.search(r"\b(b.)\b",
                                       "abcd abc bcd bx", re.UNICODE).group(1), "bx")
            self.assertEqual(re.search(r"\B(b.)\B",
                                       "abc bcd bc abxd", re.UNICODE).group(1), "bx")
        self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
        self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
        self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
        self.assertEqual(re.search(r"\b(b.)\b",
                                   u"abcd abc bcd bx").group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   u"abc bcd bc abxd").group(1), "bx")
        self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
        self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
        self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M))
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a").group(0), "1aa! a")
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a", re.LOCALE).group(0), "1aa! a")
        if have_unicode:
            self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                       "1aa! a", re.UNICODE).group(0), "1aa! a")

    def test_string_boundaries(self):
        # See http://bugs.python.org/issue10713
        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
                         "abc")
        # There's a word boundary at the start of a string.
        self.assertTrue(re.match(r"\b", "abc"))
        # A non-empty string includes a non-boundary zero-length match.
        self.assertTrue(re.search(r"\B", "abc"))
        # There is no non-boundary match at the start of a string.
        self.assertFalse(re.match(r"\B", "abc"))
        # However, an empty string contains no word boundaries, and also no
        # non-boundaries.
        self.assertIsNone(re.search(r"\B", ""))
        # This one is questionable and different from the perlre behaviour,
        # but describes current behavior.
        self.assertIsNone(re.search(r"\b", ""))
        # A single word-character string has two boundaries, but no
        # non-boundary gaps.
        self.assertEqual(len(re.findall(r"\b", "a")), 2)
        self.assertEqual(len(re.findall(r"\B", "a")), 0)
        # If there are no words, there are no boundaries
        # NOTE(review): the next two assertions are identical as written;
        # the whitespace-only literals may originally have differed in
        # length -- confirm against upstream.
        self.assertEqual(len(re.findall(r"\b", " ")), 0)
        self.assertEqual(len(re.findall(r"\b", " ")), 0)
        # Can match around the whitespace.
        self.assertEqual(len(re.findall(r"\B", " ")), 2)

    @requires_unicode
    def test_bigcharset(self):
        # Character classes containing code points above 255, including a
        # class built from every 255th code point in range(256, 2**16).
        self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                                  unichr(0x2222)).group(1), unichr(0x2222))
        self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                                  unichr(0x2222), re.UNICODE).group(1), unichr(0x2222))
        r = u'[%s]' % u''.join(map(unichr, range(256, 2**16, 255)))
        self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(), unichr(0xff01))

    def test_big_codesize(self):
        # Issue #1160
        # An alternation of 10000 decimal numbers must compile and match.
        r = re.compile('|'.join(('%d'%x for x in range(10000))))
        self.assertTrue(r.match('1000'))
        self.assertTrue(r.match('9999'))

    def test_anyall(self):
        # With DOTALL, '.' also matches a newline.
        self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
                         "a\nb")
        self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
                         "a\n\nb")

    def test_lookahead(self):
        # Positive (?=...) and negative (?!...) lookahead, including group
        # references and conditionals inside the assertion.
        self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
        self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
        self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
        self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
        self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
        self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
        self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")

        self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
        self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
        self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
        self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")

        # Group reference.
        self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
        self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
        # Named group reference.
        self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba'))
        self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac'))
        # Conditional group reference.
        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
        # Group used before defined.
        self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
        self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
        self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))

    def test_lookbehind(self):
        # Positive (?<=...) and negative (?<!...) lookbehind.  Compiling a
        # group reference inside a lookbehind is run under check_warnings
        # with a RuntimeWarning filter.
        self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
        self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
        self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
        self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
        # Group reference.
        with check_warnings(('', RuntimeWarning)):
            re.compile(r'(a)a(?<=\1)c')
        # Named group reference.
        with check_warnings(('', RuntimeWarning)):
            re.compile(r'(?P<g>a)a(?<=(?P=g))c')
        # Conditional group reference.
        with check_warnings(('', RuntimeWarning)):
            re.compile(r'(a)b(?<=(?(1)b|x))c')
        # Group used before defined.
        with check_warnings(('', RuntimeWarning)):
            re.compile(r'(a)b(?<=(?(2)b|x))(c)')

    def test_ignore_case(self):
        # re.I on literals, classes, backreferences and alternations, plus
        # non-ASCII characters whose case mapping lands on an ASCII letter.
        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

        if have_unicode:
            # The bare asserts check the case-mapping precondition the
            # following matches rely on.
            assert u(r'\u212a').lower() == u'k' # 'K'
            self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # 'ſ'
            self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))

    def test_ignore_case_set(self):
        # re.I with explicit character sets, for byte and unicode patterns.
        self.assertTrue(re.match(r'[19A]', 'A', re.I))
        self.assertTrue(re.match(r'[19a]', 'a', re.I))
        self.assertTrue(re.match(r'[19a]', 'A', re.I))
        self.assertTrue(re.match(r'[19A]', 'a', re.I))
        if have_unicode:
            self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
            assert u(r'\u212a').lower() == u'k' # 'K'
            self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # 'ſ'
            self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))

    def test_ignore_case_range(self):
        # Issues #3511, #17381.
        # re.I with character ranges.
        self.assertTrue(re.match(r'[9-a]', '_', re.I))
        self.assertIsNone(re.match(r'[9-A]', '_', re.I))
        self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
        self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
        self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I))
        self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
        # NOTE(review): nesting of the two guards below was inferred (the
        # \U escapes under the maxunicode check, everything else under
        # have_unicode) -- confirm against upstream.
        if have_unicode:
            self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
            self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\xc0-\xde]'),
                                     u(r'\xd7'), re.U | re.I))
            self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
                                       u(r'\xf7'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
                                     u(r'\xf7'), re.U | re.I))
            self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
                                       u(r'\xd7'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
                                     u(r'\u0450'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
                                     u(r'\u0400'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
                                     u(r'\u0450'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
                                     u(r'\u0400'), re.U | re.I))
            if sys.maxunicode > 0xffff:
                self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
                                         u(r'\U00010428'), re.U | re.I))
                self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
                                         u(r'\U00010400'), re.U | re.I))
                self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
                                         u(r'\U00010428'), re.U | re.I))
                self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
                                         u(r'\U00010400'), re.U | re.I))

            assert u(r'\u212a').lower() == u'k' # 'K'
            self.assertTrue(re.match(ur'[J-M]', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(ur'[j-m]', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # 'ſ'
            self.assertTrue(re.match(ur'[R-T]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur'[r-t]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u's', re.U | re.I))

    def test_category(self):
        # \s matches a single space.
        self.assertEqual(re.match(r"(\s)", " ").group(1), " ")

    def test_getlower(self):
        # _sre.getlower() lower-cases 'A' under flags 0, LOCALE and UNICODE.
        import _sre
        self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
        self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
        # NOTE(review): only the UNICODE assertion is placed under this
        # guard; confirm against upstream.
        if have_unicode:
            self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")

    def test_not_literal(self):
        # Negated single-character class after \s.
        self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
        self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")

    def test_search_coverage(self):
        # \s adjacent to a literal, on either side.
        self.assertEqual(re.search("\s(b)", " b").group(1), "b")
        self.assertEqual(re.search("a\s", "a ").group(0), "a ")

    def assertMatch(self, pattern, text, match=None, span=None,
                    matcher=re.match):
        """Assert that matcher(pattern, text) succeeds with the given result.

        With neither `match` nor `span` given, the pattern is expected to
        match the whole of `text`.  Otherwise both must be given: `match` is
        the expected m.group() and `span` the expected m.span().

        Raises ValueError if exactly one of `match` / `span` is supplied.
        """
        if match is None and span is None:
            # the pattern matches the whole text
            match = text
            span = (0, len(text))
        elif match is None or span is None:
            raise ValueError('If match is not None, span should be specified '
                             '(and vice versa).')
        m = matcher(pattern, text)
        self.assertTrue(m)
        self.assertEqual(m.group(), match)
        self.assertEqual(m.span(), span)

    @requires_unicode
    def test_re_escape(self):
        # For each of the first 256 code points: ASCII letters and digits are
        # left alone, NUL becomes '\000', everything else gets a backslash;
        # the escaped form must match the original character.
        alnum_chars = unicode(string.ascii_letters + string.digits)
        p = u''.join(unichr(i) for i in range(256))
        for c in p:
            if c in alnum_chars:
                self.assertEqual(re.escape(c), c)
            elif c == u'\x00':
                self.assertEqual(re.escape(c), u'\\000')
            else:
                self.assertEqual(re.escape(c), u'\\' + c)
            self.assertMatch(re.escape(c), c)
        self.assertMatch(re.escape(p), p)

    def test_re_escape_byte(self):
        # Byte-string counterpart of test_re_escape.
        alnum_chars = string.ascii_letters + string.digits
        p = ''.join(chr(i) for i in range(256))
        for b in p:
            if b in alnum_chars:
                self.assertEqual(re.escape(b), b)
            elif b == b'\x00':
                self.assertEqual(re.escape(b), b'\\000')
            else:
                self.assertEqual(re.escape(b), b'\\' + b)
            self.assertMatch(re.escape(b), b)
        self.assertMatch(re.escape(p), p)

    @requires_unicode
    def test_re_escape_non_ascii(self):
        # Non-ASCII unicode characters are backslash-escaped and the escaped
        # pattern still matches the original text.
        s = u(r'xxx\u2620\u2620\u2620xxx')
        s_escaped = re.escape(s)
        self.assertEqual(s_escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx'))
        self.assertMatch(s_escaped, s)
        self.assertMatch(u'.%s+.' % re.escape(unichr(0x2620)), s,
                         u(r'x\u2620\u2620\u2620x'), (2, 7), re.search)

    def test_re_escape_non_ascii_bytes(self):
        # Each non-ASCII byte is escaped individually.
        b = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
        b_escaped = re.escape(b)
        self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
        self.assertMatch(b_escaped, b)
        res = re.findall(re.escape(b'\xe2\x98\xa0'), b)
        self.assertEqual(len(res), 2)

    def test_pickling(self):
        # Round-trip a compiled pattern through pickle and cPickle, then
        # check that the _compile reconstructor is importable from both
        # modules (the imports themselves are the test).
        import pickle
        self.pickle_test(pickle)
        import cPickle
        self.pickle_test(cPickle)
        # old pickles expect the _compile() reconstructor in sre module
        import_module("sre", deprecated=True)
        from sre import _compile
        # current pickle expects the _compile() reconstructor in re module
        from re import _compile

    def pickle_test(self, pickle):
        """Check a compiled pattern survives dumps/loads at every protocol.

        `pickle` is the module to test; only its dumps, loads and
        HIGHEST_PROTOCOL attributes are used.
        """
        oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
            pickled = pickle.dumps(oldpat, proto)
            newpat = pickle.loads(pickled)
            self.assertEqual(newpat, oldpat)

    def test_constants(self):
        # Short flag names are aliases of the long ones.
        self.assertEqual(re.I, re.IGNORECASE)
        self.assertEqual(re.L, re.LOCALE)
        self.assertEqual(re.M, re.MULTILINE)
        self.assertEqual(re.S, re.DOTALL)
        self.assertEqual(re.X, re.VERBOSE)

    def test_flags(self):
        # Every listed flag is accepted by compile().
        for flag in [re.I, re.M, re.X, re.S, re.L]:
            self.assertTrue(re.compile('^pattern$', flag))

    def test_sre_character_literals(self):
        # Three-digit octal and two-digit hex escapes for a spread of byte
        # values, each also followed by one more literal character.
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertTrue(re.match(r"\%03o" % i, chr(i)))
            self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
            self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
            self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
        self.assertRaises(re.error, re.match, "\911", "")

    def test_sre_character_class_literals(self):
        # The same escapes as test_sre_character_literals, inside [...].
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
        self.assertRaises(re.error, re.match, "[\911]", "")

    def test_bug_113254(self):
        # start/end/span of a group that did not participate are -1.
        self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
        self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
        self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))

    def test_bug_527371(self):
        # bug described in patches 527371/672491
        self.assertIsNone(re.match(r'(a)?a','a').lastindex)
        self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
        self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
        self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
        self.assertEqual(re.match("((a))", "a").lastindex, 1)

    def test_bug_545855(self):
        # bug 545855 -- This pattern failed to cause a compile error as it
        # should, instead provoking a TypeError.
        self.assertRaises(re.error, re.compile, 'foo[a-')

    def test_bug_418626(self):
        # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
        # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
        # pattern '*?' on a long string.
        self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
        self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
                         20003)
        self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
        # non-simple '*?' still used to hit the recursion limit, before the
        # non-recursive scheme was implemented.
        self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)

    @requires_unicode
    def test_bug_612074(self):
        # An escaped non-ASCII character inside a class must compile; the
        # `and 1` reduces the compiled pattern to a comparable value.
        pat=u"["+re.escape(unichr(0x2039))+u"]"
        self.assertEqual(re.compile(pat) and 1, 1)

    def test_stack_overflow(self):
        # nasty cases that used to overflow the straightforward recursive
        # implementation of repeated groups.
        self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
        self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
        self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')

    def test_unlimited_zero_width_repeat(self):
        # Issue #9669
        # Unbounded repeats of a possibly-empty group must fail cleanly.
        self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
        self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
        self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
        self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
        self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
        self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))

    def test_scanner(self):
        # Tokenise a small expression with re.Scanner; a None action (the
        # whitespace rule) contributes nothing to the token list.
        def s_ident(scanner, token): return token
        def s_operator(scanner, token): return "op%s" % token
        def s_float(scanner, token): return float(token)
        def s_int(scanner, token): return int(token)

        scanner = Scanner([
            (r"[a-zA-Z_]\w*", s_ident),
            (r"\d+\.\d*", s_float),
            (r"\d+", s_int),
            (r"=|\+|-|\*|/", s_operator),
            (r"\s+", None),
            ])

        self.assertTrue(scanner.scanner.scanner("").pattern)

        # scan() returns (tokens, unscanned_remainder).
        self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
                         (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
                           'op+', 'bar'], ''))

    def test_bug_448951(self):
        # bug 448951 (similar to 429357, but with single char match)
        # (Also test greedy matches.)
        for op in '','?','*':
            self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
                             (None, None))
            self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
                             ('a:', 'a'))

    def test_bug_725106(self):
        # capturing groups in alternatives in repeats
        self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
                         ('b', 'a'))
        self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
                         ('c', 'b'))
        self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
                         ('b', None))
        self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
                         ('b', None))
        self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
                         ('b', 'a'))
        self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
                         ('c', 'b'))
        self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
                         ('b', None))
        self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
                         ('b', None))

    def test_bug_725149(self):
        # mark_stack_base restoring before restoring marks
        self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
                         ('a', None))
        self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
                         ('a', None, None))

    @requires_unicode
    def test_bug_764548(self):
        # bug 764548, re.compile() barfs on str/unicode subclasses
        class my_unicode(unicode): pass
        pat = re.compile(my_unicode("abc"))
        self.assertIsNone(pat.match("xyz"))

    def test_finditer(self):
        # finditer() yields match objects in order of occurrence.
        iter = re.finditer(r":+", "a:b::c:::d")
        self.assertEqual([item.group(0) for item in iter],
                         [":", "::", ":::"])

    @requires_unicode
    def test_bug_926075(self):
        # Equal-looking str and unicode patterns must compile to distinct
        # pattern objects.
        self.assertIsNot(re.compile('bug_926075'),
                         re.compile(u'bug_926075'))

    @requires_unicode
    def test_bug_931848(self):
        # A unicode class written with \u escapes splits a byte string on
        # the ASCII full stop (U+002E).
        pattern = u(r"[\u002E\u3002\uFF0E\uFF61]")
        self.assertEqual(re.compile(pattern).split("a.b.c"),
                         ['a','b','c'])

    def test_bug_581080(self):
        iter = re.finditer(r"\s", "a b")
        self.assertEqual(iter.next().span(),
(1,2)) 856 self.assertRaises(StopIteration, iter.next) 857 858 scanner = re.compile(r"\s").scanner("a b") 859 self.assertEqual(scanner.search().span(), (1, 2)) 860 self.assertIsNone(scanner.search()) 861 862 def test_bug_817234(self): 863 iter = re.finditer(r".*", "asdf") 864 self.assertEqual(iter.next().span(), (0, 4)) 865 self.assertEqual(iter.next().span(), (4, 4)) 866 self.assertRaises(StopIteration, iter.next) 867 868 @requires_unicode 869 def test_bug_6561(self): 870 # '\d' should match characters in Unicode category 'Nd' 871 # (Number, Decimal Digit), but not those in 'Nl' (Number, 872 # Letter) or 'No' (Number, Other). 873 decimal_digits = [ 874 unichr(0x0037), # '\N{DIGIT SEVEN}', category 'Nd' 875 unichr(0x0e58), # '\N{THAI DIGIT SIX}', category 'Nd' 876 unichr(0xff10), # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' 877 ] 878 for x in decimal_digits: 879 self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x) 880 881 not_decimal_digits = [ 882 unichr(0x2165), # '\N{ROMAN NUMERAL SIX}', category 'Nl' 883 unichr(0x3039), # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' 884 unichr(0x2082), # '\N{SUBSCRIPT TWO}', category 'No' 885 unichr(0x32b4), # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' 886 ] 887 for x in not_decimal_digits: 888 self.assertIsNone(re.match('^\d$', x, re.UNICODE)) 889 890 def test_empty_array(self): 891 # SF buf 1647541 892 import array 893 typecodes = 'cbBhHiIlLfd' 894 if have_unicode: 895 typecodes += 'u' 896 for typecode in typecodes: 897 a = array.array(typecode) 898 self.assertIsNone(re.compile("bla").match(a)) 899 self.assertEqual(re.compile("").match(a).groups(), ()) 900 901 @requires_unicode 902 def test_inline_flags(self): 903 # Bug #1700 904 upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow 905 lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow 906 907 p = re.compile(upper_char, re.I | re.U) 908 q = p.match(lower_char) 909 self.assertTrue(q) 910 911 p = re.compile(lower_char, re.I | re.U) 
912 q = p.match(upper_char) 913 self.assertTrue(q) 914 915 p = re.compile('(?i)' + upper_char, re.U) 916 q = p.match(lower_char) 917 self.assertTrue(q) 918 919 p = re.compile('(?i)' + lower_char, re.U) 920 q = p.match(upper_char) 921 self.assertTrue(q) 922 923 p = re.compile('(?iu)' + upper_char) 924 q = p.match(lower_char) 925 self.assertTrue(q) 926 927 p = re.compile('(?iu)' + lower_char) 928 q = p.match(upper_char) 929 self.assertTrue(q) 930 931 self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char)) 932 self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char)) 933 934 def test_dollar_matches_twice(self): 935 "$ matches the end of string, and just before the terminating \n" 936 pattern = re.compile('$') 937 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') 938 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') 939 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 940 941 pattern = re.compile('$', re.MULTILINE) 942 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) 943 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') 944 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 945 946 def test_dealloc(self): 947 # issue 3299: check for segfault in debug build 948 import _sre 949 # the overflow limit is different on wide and narrow builds and it 950 # depends on the definition of SRE_CODE (see sre.h). 951 # 2**128 should be big enough to overflow on both. For smaller values 952 # a RuntimeError is raised instead of OverflowError. 
953 long_overflow = 2**128 954 self.assertRaises(TypeError, re.finditer, "a", {}) 955 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) 956 957 def test_compile(self): 958 # Test return value when given string and pattern as parameter 959 pattern = re.compile('random pattern') 960 self.assertIsInstance(pattern, re._pattern_type) 961 same_pattern = re.compile(pattern) 962 self.assertIsInstance(same_pattern, re._pattern_type) 963 self.assertIs(same_pattern, pattern) 964 # Test behaviour when not given a string or pattern as parameter 965 self.assertRaises(TypeError, re.compile, 0) 966 967 def test_bug_13899(self): 968 # Issue #13899: re pattern r"[\A]" should work like "A" but matches 969 # nothing. Ditto B and Z. 970 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), 971 ['A', 'B', '\b', 'C', 'Z']) 972 973 @precisionbigmemtest(size=_2G, memuse=1) 974 def test_large_search(self, size): 975 # Issue #10182: indices were 32-bit-truncated. 976 s = 'a' * size 977 m = re.search('$', s) 978 self.assertIsNotNone(m) 979 self.assertEqual(m.start(), size) 980 self.assertEqual(m.end(), size) 981 982 # The huge memuse is because of re.sub() using a list and a join() 983 # to create the replacement result. 984 @precisionbigmemtest(size=_2G, memuse=16 + 2) 985 def test_large_subn(self, size): 986 # Issue #10182: indices were 32-bit-truncated. 
987 s = 'a' * size 988 r, n = re.subn('', '', s) 989 self.assertEqual(r, s) 990 self.assertEqual(n, size + 1) 991 992 993 def test_repeat_minmax_overflow(self): 994 # Issue #13169 995 string = "x" * 100000 996 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535)) 997 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535)) 998 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535)) 999 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536)) 1000 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536)) 1001 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536)) 1002 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t. 1003 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128) 1004 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128) 1005 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128) 1006 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) 1007 1008 @cpython_only 1009 def test_repeat_minmax_overflow_maxrepeat(self): 1010 try: 1011 from _sre import MAXREPEAT 1012 except ImportError: 1013 self.skipTest('requires _sre.MAXREPEAT constant') 1014 string = "x" * 100000 1015 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string)) 1016 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(), 1017 (0, 100000)) 1018 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string)) 1019 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT) 1020 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT) 1021 self.assertRaises(OverflowError, re.compile, r".{%d,}?" 
% MAXREPEAT) 1022 1023 def test_backref_group_name_in_exception(self): 1024 # Issue 17341: Poor error message when compiling invalid regex 1025 with self.assertRaisesRegexp(sre_constants.error, '<foo>'): 1026 re.compile('(?P=<foo>)') 1027 1028 def test_group_name_in_exception(self): 1029 # Issue 17341: Poor error message when compiling invalid regex 1030 with self.assertRaisesRegexp(sre_constants.error, '\?foo'): 1031 re.compile('(?P<?foo>)') 1032 1033 def test_issue17998(self): 1034 for reps in '*', '+', '?', '{1}': 1035 for mod in '', '?': 1036 pattern = '.' + reps + mod + 'yz' 1037 self.assertEqual(re.compile(pattern, re.S).findall('xyz'), 1038 ['xyz'], msg=pattern) 1039 if have_unicode: 1040 pattern = unicode(pattern) 1041 self.assertEqual(re.compile(pattern, re.S).findall(u'xyz'), 1042 [u'xyz'], msg=pattern) 1043 1044 1045 def test_bug_2537(self): 1046 # issue 2537: empty submatches 1047 for outer_op in ('{0,}', '*', '+', '{1,187}'): 1048 for inner_op in ('{0,}', '*', '?'): 1049 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op)) 1050 m = r.match("xyyzy") 1051 self.assertEqual(m.group(0), "xyy") 1052 self.assertEqual(m.group(1), "") 1053 self.assertEqual(m.group(2), "y") 1054 1055 def test_debug_flag(self): 1056 pat = r'(\.)(?:[ch]|py)(?(1)$|: )' 1057 with captured_stdout() as out: 1058 re.compile(pat, re.DEBUG) 1059 dump = '''\ 1060subpattern 1 1061 literal 46 1062subpattern None 1063 branch 1064 in 1065 literal 99 1066 literal 104 1067 or 1068 literal 112 1069 literal 121 1070subpattern None 1071 groupref_exists 1 1072 at at_end 1073 else 1074 literal 58 1075 literal 32 1076''' 1077 self.assertEqual(out.getvalue(), dump) 1078 # Debug output is output again even a second time (bypassing 1079 # the cache -- issue #20426). 1080 with captured_stdout() as out: 1081 re.compile(pat, re.DEBUG) 1082 self.assertEqual(out.getvalue(), dump) 1083 1084 def test_keyword_parameters(self): 1085 # Issue #20283: Accepting the string keyword parameter. 
1086 pat = re.compile(r'(ab)') 1087 self.assertEqual( 1088 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9)) 1089 self.assertEqual( 1090 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9)) 1091 self.assertEqual( 1092 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab']) 1093 self.assertEqual( 1094 pat.split(string='abracadabra', maxsplit=1), 1095 ['', 'ab', 'racadabra']) 1096 1097 def test_match_group_takes_long(self): 1098 self.assertEqual(re.match("(foo)", "foo").group(1L), "foo") 1099 self.assertRaises(IndexError, re.match("", "").group, sys.maxint + 1) 1100 1101 def test_locale_caching(self): 1102 # Issue #22410 1103 oldlocale = locale.setlocale(locale.LC_CTYPE) 1104 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1105 for loc in 'en_US.iso88591', 'en_US.utf8': 1106 try: 1107 locale.setlocale(locale.LC_CTYPE, loc) 1108 except locale.Error: 1109 # Unsupported locale on this system 1110 self.skipTest('test needs %s locale' % loc) 1111 1112 re.purge() 1113 self.check_en_US_iso88591() 1114 self.check_en_US_utf8() 1115 re.purge() 1116 self.check_en_US_utf8() 1117 self.check_en_US_iso88591() 1118 1119 def check_en_US_iso88591(self): 1120 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') 1121 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1122 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1123 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1124 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1125 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5')) 1126 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5')) 1127 1128 def check_en_US_utf8(self): 1129 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') 1130 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1131 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1132 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1133 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1134 
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) 1135 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) 1136 1137 1138def run_re_tests(): 1139 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR 1140 if verbose: 1141 print 'Running re_tests test suite' 1142 else: 1143 # To save time, only run the first and last 10 tests 1144 #tests = tests[:10] + tests[-10:] 1145 pass 1146 1147 for t in tests: 1148 sys.stdout.flush() 1149 pattern = s = outcome = repl = expected = None 1150 if len(t) == 5: 1151 pattern, s, outcome, repl, expected = t 1152 elif len(t) == 3: 1153 pattern, s, outcome = t 1154 else: 1155 raise ValueError, ('Test tuples should have 3 or 5 fields', t) 1156 1157 try: 1158 obj = re.compile(pattern) 1159 except re.error: 1160 if outcome == SYNTAX_ERROR: pass # Expected a syntax error 1161 else: 1162 print '=== Syntax error:', t 1163 except KeyboardInterrupt: raise KeyboardInterrupt 1164 except: 1165 print '*** Unexpected error ***', t 1166 if verbose: 1167 traceback.print_exc(file=sys.stdout) 1168 else: 1169 try: 1170 result = obj.search(s) 1171 except re.error, msg: 1172 print '=== Unexpected exception', t, repr(msg) 1173 if outcome == SYNTAX_ERROR: 1174 # This should have been a syntax error; forget it. 1175 pass 1176 elif outcome == FAIL: 1177 if result is None: pass # No match, as expected 1178 else: print '=== Succeeded incorrectly', t 1179 elif outcome == SUCCEED: 1180 if result is not None: 1181 # Matched, as expected, so now we compute the 1182 # result string and compare it to our expected result. 
1183 start, end = result.span(0) 1184 vardict={'found': result.group(0), 1185 'groups': result.group(), 1186 'flags': result.re.flags} 1187 for i in range(1, 100): 1188 try: 1189 gi = result.group(i) 1190 # Special hack because else the string concat fails: 1191 if gi is None: 1192 gi = "None" 1193 except IndexError: 1194 gi = "Error" 1195 vardict['g%d' % i] = gi 1196 for i in result.re.groupindex.keys(): 1197 try: 1198 gi = result.group(i) 1199 if gi is None: 1200 gi = "None" 1201 except IndexError: 1202 gi = "Error" 1203 vardict[i] = gi 1204 repl = eval(repl, vardict) 1205 if repl != expected: 1206 print '=== grouping error', t, 1207 print repr(repl) + ' should be ' + repr(expected) 1208 else: 1209 print '=== Failed incorrectly', t 1210 1211 # Try the match on a unicode string, and check that it 1212 # still succeeds. 1213 try: 1214 result = obj.search(unicode(s, "latin-1")) 1215 if result is None: 1216 print '=== Fails on unicode match', t 1217 except NameError: 1218 continue # 1.5.2 1219 except TypeError: 1220 continue # unicode test case 1221 1222 # Try the match on a unicode pattern, and check that it 1223 # still succeeds. 1224 obj=re.compile(unicode(pattern, "latin-1")) 1225 result = obj.search(s) 1226 if result is None: 1227 print '=== Fails on unicode pattern match', t 1228 1229 # Try the match with the search area limited to the extent 1230 # of the match and see if it still succeeds. \B will 1231 # break (because it won't match at the end or start of a 1232 # string), so we'll ignore patterns that feature it. 1233 1234 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \ 1235 and result is not None: 1236 obj = re.compile(pattern) 1237 result = obj.search(s, result.start(0), result.end(0) + 1) 1238 if result is None: 1239 print '=== Failed on range-limited match', t 1240 1241 # Try the match with IGNORECASE enabled, and check that it 1242 # still succeeds. 
1243 obj = re.compile(pattern, re.IGNORECASE) 1244 result = obj.search(s) 1245 if result is None: 1246 print '=== Fails on case-insensitive match', t 1247 1248 # Try the match with LOCALE enabled, and check that it 1249 # still succeeds. 1250 obj = re.compile(pattern, re.LOCALE) 1251 result = obj.search(s) 1252 if result is None: 1253 print '=== Fails on locale-sensitive match', t 1254 1255 # Try the match with UNICODE locale enabled, and check 1256 # that it still succeeds. 1257 obj = re.compile(pattern, re.UNICODE) 1258 result = obj.search(s) 1259 if result is None: 1260 print '=== Fails on unicode-sensitive match', t 1261 1262def test_main(): 1263 run_unittest(ReTests) 1264 run_re_tests() 1265 1266if __name__ == "__main__": 1267 test_main() 1268