1# -*- coding: utf-8 -*- 2from test.test_support import ( 3 verbose, run_unittest, import_module, 4 precisionbigmemtest, _2G, cpython_only, 5 captured_stdout, have_unicode, requires_unicode, u, 6 check_warnings, check_py3k_warnings) 7import locale 8import re 9from re import Scanner 10import sre_constants 11import sys 12import string 13import traceback 14from weakref import proxy 15 16 17# Misc tests from Tim Peters' re.doc 18 19# WARNING: Don't change details in these tests if you don't know 20# what you're doing. Some of these tests were carefully modeled to 21# cover most of the code. 22 23import unittest 24 25class ReTests(unittest.TestCase): 26 27 def test_weakref(self): 28 s = 'QabbbcR' 29 x = re.compile('ab+c') 30 y = proxy(x) 31 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR')) 32 33 def test_search_star_plus(self): 34 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0)) 35 self.assertEqual(re.search('x*', 'axx').span(), (0, 0)) 36 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3)) 37 self.assertEqual(re.search('x+', 'axx').span(), (1, 3)) 38 self.assertIsNone(re.search('x', 'aaa')) 39 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0)) 40 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0)) 41 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3)) 42 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) 43 self.assertIsNone(re.match('a+', 'xxx')) 44 45 def bump_num(self, matchobj): 46 int_value = int(matchobj.group(0)) 47 return str(int_value + 1) 48 49 def test_basic_re_sub(self): 50 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') 51 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), 52 '9.3 -3 24x100y') 53 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), 54 '9.3 -3 23x99y') 55 56 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') 57 self.assertEqual(re.sub('.', r"\n", 'x'), '\n') 58 59 s = r"\1\1" 60 self.assertEqual(re.sub('(.)', s, 'x'), 'xx') 61 self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s) 62 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) 63 64 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx') 65 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx') 66 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx') 67 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx') 68 69 self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 70 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 71 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), 72 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) 73 for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': 74 with check_py3k_warnings(): 75 self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c) 76 77 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') 78 79 def test_bug_449964(self): 80 # fails for group followed by other escape 81 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'), 82 'xx\bxx\b') 83 84 def test_bug_449000(self): 85 # Test for sub() on escaped characters 86 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), 87 'abc\ndef\n') 88 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), 89 'abc\ndef\n') 90 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), 91 'abc\ndef\n') 92 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 93 'abc\ndef\n') 94 95 @requires_unicode 96 def test_bug_1140(self): 97 # re.sub(x, y, u'') should return u'', not '', and 98 # re.sub(x, y, '') should return '', not u''. 99 # Also: 100 # re.sub(x, y, unicode(x)) should return unicode(y), and 101 # re.sub(x, y, str(x)) should return 102 # str(y) if isinstance(y, str) else unicode(y). 103 for x in 'x', u'x': 104 for y in 'y', u'y': 105 z = re.sub(x, y, u'') 106 self.assertEqual(z, u'') 107 self.assertEqual(type(z), unicode) 108 # 109 z = re.sub(x, y, '') 110 self.assertEqual(z, '') 111 self.assertEqual(type(z), str) 112 # 113 z = re.sub(x, y, unicode(x)) 114 self.assertEqual(z, y) 115 self.assertEqual(type(z), unicode) 116 # 117 z = re.sub(x, y, str(x)) 118 self.assertEqual(z, y) 119 self.assertEqual(type(z), type(y)) 120 121 def test_bug_1661(self): 122 # Verify that flags do not get silently ignored with compiled patterns 123 pattern = re.compile('.') 124 self.assertRaises(ValueError, re.match, pattern, 'A', re.I) 125 self.assertRaises(ValueError, re.search, pattern, 'A', re.I) 126 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) 127 self.assertRaises(ValueError, re.compile, pattern, re.I) 128 129 def test_bug_3629(self): 130 # A regex that triggered a bug in the sre-code validator 131 re.compile("(?P<quote>)(?(quote))") 132 133 def test_sub_template_numeric_escape(self): 134 # bug 776311 and friends 135 self.assertEqual(re.sub('x', r'\0', 'x'), '\0') 136 self.assertEqual(re.sub('x', r'\000', 'x'), '\000') 137 self.assertEqual(re.sub('x', r'\001', 'x'), '\001') 138 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') 139 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') 140 self.assertEqual(re.sub('x', r'\111', 'x'), '\111') 141 self.assertEqual(re.sub('x', r'\117', 'x'), '\117') 142 143 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') 144 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') 145 146 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') 147 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') 148 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') 149 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') 150 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') 151 152 self.assertEqual(re.sub('x', r'\400', 'x'), '\0') 153 self.assertEqual(re.sub('x', r'\777', 'x'), '\377') 154 155 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x') 156 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x') 157 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x') 158 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x') 159 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x') 160 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x') 161 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x') 162 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x') 163 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8' 164 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x') 165 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1' 166 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0' 167 168 # in python2.3 (etc), these loop endlessly in sre_parser.py 169 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') 170 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), 171 'xz8') 172 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), 173 'xza') 174 175 def test_qualified_re_sub(self): 176 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') 177 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') 178 179 def test_bug_114660(self): 180 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 181 'hello there') 182 183 def test_bug_462270(self): 184 # Test for empty sub() behaviour, see SF bug #462270 185 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-') 186 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d') 187 188 def test_symbolic_groups(self): 189 re.compile('(?P<a>x)(?P=a)(?(a)y)') 190 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)') 191 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)') 192 self.assertRaises(re.error, re.compile, '(?Px)') 193 self.assertRaises(re.error, re.compile, '(?P=)') 194 self.assertRaises(re.error, re.compile, '(?P=1)') 195 self.assertRaises(re.error, re.compile, '(?P=a)') 196 self.assertRaises(re.error, re.compile, '(?P=a1)') 197 self.assertRaises(re.error, re.compile, '(?P=a.)') 198 self.assertRaises(re.error, re.compile, '(?P<)') 199 self.assertRaises(re.error, re.compile, '(?P<>)') 200 self.assertRaises(re.error, re.compile, '(?P<1>)') 201 self.assertRaises(re.error, re.compile, '(?P<a.>)') 202 self.assertRaises(re.error, re.compile, '(?())') 203 self.assertRaises(re.error, re.compile, '(?(a))') 204 self.assertRaises(re.error, re.compile, '(?(1a))') 205 self.assertRaises(re.error, re.compile, '(?(a.))') 206 207 def test_symbolic_refs(self): 208 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx') 209 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx') 210 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx') 211 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx') 212 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx') 213 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx') 214 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx') 215 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx') 216 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx') 217 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx') 218 219 def test_re_subn(self): 220 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) 221 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) 222 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) 223 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) 224 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) 225 226 def test_re_split(self): 227 self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) 228 self.assertEqual(re.split(":+", ":a:b::c"), ['', 'a', 'b', 'c']) 229 self.assertEqual(re.split("(:+)", ":a:b::c"), 230 ['', ':', 'a', ':', 'b', '::', 'c']) 231 self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c']) 232 self.assertEqual(re.split("(:)+", ":a:b::c"), 233 ['', ':', 'a', ':', 'b', ':', 'c']) 234 self.assertEqual(re.split("([b:]+)", ":a:b::c"), 235 ['', ':', 'a', ':b::', 'c']) 236 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), 237 ['', None, ':', 'a', None, ':', '', 'b', None, '', 238 None, '::', 'c']) 239 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), 240 ['', 'a', '', '', 'c']) 241 242 for sep, expected in [ 243 (':*', ['', 'a', 'b', 'c']), 244 ('(?::*)', ['', 'a', 'b', 'c']), 245 ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']), 246 ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']), 247 ]: 248 with check_py3k_warnings(('', FutureWarning)): 249 self.assertEqual(re.split(sep, ':a:b::c'), expected) 250 251 for sep, expected in [ 252 ('', [':a:b::c']), 253 (r'\b', [':a:b::c']), 254 (r'(?=:)', [':a:b::c']), 255 (r'(?<=:)', [':a:b::c']), 256 ]: 257 with check_py3k_warnings(): 258 self.assertEqual(re.split(sep, ':a:b::c'), expected) 259 260 def test_qualified_re_split(self): 261 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) 262 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) 263 self.assertEqual(re.split("(:)", ":a:b::c", 2), 264 ['', ':', 'a', ':', 'b::c']) 265 self.assertEqual(re.split("(:+)", ":a:b::c", 2), 266 ['', ':', 'a', ':', 'b::c']) 267 with check_py3k_warnings(('', FutureWarning)): 268 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), 269 ['', ':', 'a', ':', 'b::c']) 270 271 def test_re_findall(self): 272 self.assertEqual(re.findall(":+", "abc"), []) 273 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"]) 274 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"]) 275 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""), 276 (":", ":"), 277 (":", "::")]) 278 279 def test_bug_117612(self): 280 self.assertEqual(re.findall(r"(a|(b))", "aba"), 281 [("a", ""),("b", "b"),("a", "")]) 282 283 def test_re_match(self): 284 self.assertEqual(re.match('a', 'a').groups(), ()) 285 self.assertEqual(re.match('(a)', 'a').groups(), ('a',)) 286 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a') 287 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a') 288 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a')) 289 290 pat = re.compile('((a)|(b))(c)?') 291 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) 292 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) 293 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) 294 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) 295 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) 296 297 # A single group 298 m = re.match('(a)', 'a') 299 self.assertEqual(m.group(0), 'a') 300 self.assertEqual(m.group(0), 'a') 301 self.assertEqual(m.group(1), 'a') 302 self.assertEqual(m.group(1, 1), ('a', 'a')) 303 304 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 305 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) 306 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), 307 (None, 'b', None)) 308 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) 309 310 def test_re_groupref_exists(self): 311 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(), 312 ('(', 'a')) 313 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(), 314 (None, 'a')) 315 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)')) 316 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a')) 317 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), 318 ('a', 'b')) 319 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), 320 (None, 'd')) 321 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(), 322 (None, 'd')) 323 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(), 324 ('a', '')) 325 326 # Tests for bug #1177831: exercise groups other than the first group 327 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))') 328 self.assertEqual(p.match('abc').groups(), 329 ('a', 'b', 'c')) 330 self.assertEqual(p.match('ad').groups(), 331 ('a', None, 'd')) 332 self.assertIsNone(p.match('abd')) 333 self.assertIsNone(p.match('ac')) 334 335 336 def test_re_groupref(self): 337 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), 338 ('|', 'a')) 339 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), 340 (None, 'a')) 341 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|')) 342 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a')) 343 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), 344 ('a', 'a')) 345 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), 346 (None, None)) 347 348 def test_groupdict(self): 349 self.assertEqual(re.match('(?P<first>first) (?P<second>second)', 350 'first second').groupdict(), 351 {'first':'first', 'second':'second'}) 352 353 def test_expand(self): 354 self.assertEqual(re.match("(?P<first>first) (?P<second>second)", 355 "first second") 356 .expand(r"\2 \1 \g<second> \g<first>"), 357 "second first second first") 358 359 def test_repeat_minmax(self): 360 self.assertIsNone(re.match("^(\w){1}$", "abc")) 361 self.assertIsNone(re.match("^(\w){1}?$", "abc")) 362 self.assertIsNone(re.match("^(\w){1,2}$", "abc")) 363 self.assertIsNone(re.match("^(\w){1,2}?$", "abc")) 364 365 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c") 366 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c") 367 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c") 368 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") 369 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c") 370 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c") 371 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c") 372 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") 373 374 self.assertIsNone(re.match("^x{1}$", "xxx")) 375 self.assertIsNone(re.match("^x{1}?$", "xxx")) 376 self.assertIsNone(re.match("^x{1,2}$", "xxx")) 377 self.assertIsNone(re.match("^x{1,2}?$", "xxx")) 378 379 self.assertTrue(re.match("^x{3}$", "xxx")) 380 self.assertTrue(re.match("^x{1,3}$", "xxx")) 381 self.assertTrue(re.match("^x{1,4}$", "xxx")) 382 self.assertTrue(re.match("^x{3,4}?$", "xxx")) 383 self.assertTrue(re.match("^x{3}?$", "xxx")) 384 self.assertTrue(re.match("^x{1,3}?$", "xxx")) 385 self.assertTrue(re.match("^x{1,4}?$", "xxx")) 386 self.assertTrue(re.match("^x{3,4}?$", "xxx")) 387 388 self.assertIsNone(re.match("^x{}$", "xxx")) 389 self.assertTrue(re.match("^x{}$", "x{}")) 390 391 def test_getattr(self): 392 self.assertEqual(re.match("(a)", "a").pos, 0) 393 self.assertEqual(re.match("(a)", "a").endpos, 1) 394 self.assertEqual(re.match("(a)", "a").string, "a") 395 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) 396 self.assertTrue(re.match("(a)", "a").re) 397 398 def test_special_escapes(self): 399 self.assertEqual(re.search(r"\b(b.)\b", 400 "abcd abc bcd bx").group(1), "bx") 401 self.assertEqual(re.search(r"\B(b.)\B", 402 "abc bcd bc abxd").group(1), "bx") 403 self.assertEqual(re.search(r"\b(b.)\b", 404 "abcd abc bcd bx", re.LOCALE).group(1), "bx") 405 self.assertEqual(re.search(r"\B(b.)\B", 406 "abc bcd bc abxd", re.LOCALE).group(1), "bx") 407 if have_unicode: 408 self.assertEqual(re.search(r"\b(b.)\b", 409 "abcd abc bcd bx", re.UNICODE).group(1), "bx") 410 self.assertEqual(re.search(r"\B(b.)\B", 411 "abc bcd bc abxd", re.UNICODE).group(1), "bx") 412 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") 413 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") 414 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M)) 415 self.assertEqual(re.search(r"\b(b.)\b", 416 u"abcd abc bcd bx").group(1), "bx") 417 self.assertEqual(re.search(r"\B(b.)\B", 418 u"abc bcd bc abxd").group(1), "bx") 419 self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc") 420 self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc") 421 self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M)) 422 self.assertEqual(re.search(r"\d\D\w\W\s\S", 423 "1aa! a").group(0), "1aa! a") 424 self.assertEqual(re.search(r"\d\D\w\W\s\S", 425 "1aa! a", re.LOCALE).group(0), "1aa! a") 426 if have_unicode: 427 self.assertEqual(re.search(r"\d\D\w\W\s\S", 428 "1aa! a", re.UNICODE).group(0), "1aa! a") 429 430 def test_other_escapes(self): 431 self.assertRaises(re.error, re.compile, "\\") 432 self.assertEqual(re.match(r"\(", '(').group(), '(') 433 self.assertIsNone(re.match(r"\(", ')')) 434 self.assertEqual(re.match(r"\\", '\\').group(), '\\') 435 self.assertEqual(re.match(r"[\]]", ']').group(), ']') 436 self.assertIsNone(re.match(r"[\]]", '[')) 437 self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') 438 self.assertIsNone(re.match(r"[a\-c]", 'b')) 439 self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') 440 self.assertIsNone(re.match(r"[\^a]+", 'b')) 441 re.purge() # for warnings 442 for c in 'ceghijklmopquyzCEFGHIJKLMNOPQRTUVXY': 443 warn = FutureWarning if c in 'Uu' else DeprecationWarning 444 with check_py3k_warnings(('', warn)): 445 self.assertEqual(re.match('\\%c$' % c, c).group(), c) 446 self.assertIsNone(re.match('\\%c' % c, 'a')) 447 for c in 'ceghijklmopquyzABCEFGHIJKLMNOPQRTUVXYZ': 448 warn = FutureWarning if c in 'Uu' else DeprecationWarning 449 with check_py3k_warnings(('', warn)): 450 self.assertEqual(re.match('[\\%c]$' % c, c).group(), c) 451 self.assertIsNone(re.match('[\\%c]' % c, 'a')) 452 453 def test_string_boundaries(self): 454 # See http://bugs.python.org/issue10713 455 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), 456 "abc") 457 # There's a word boundary at the start of a string. 458 self.assertTrue(re.match(r"\b", "abc")) 459 # A non-empty string includes a non-boundary zero-length match. 460 self.assertTrue(re.search(r"\B", "abc")) 461 # There is no non-boundary match at the start of a string. 462 self.assertFalse(re.match(r"\B", "abc")) 463 # However, an empty string contains no word boundaries, and also no 464 # non-boundaries. 465 self.assertIsNone(re.search(r"\B", "")) 466 # This one is questionable and different from the perlre behaviour, 467 # but describes current behavior. 468 self.assertIsNone(re.search(r"\b", "")) 469 # A single word-character string has two boundaries, but no 470 # non-boundary gaps. 471 self.assertEqual(len(re.findall(r"\b", "a")), 2) 472 self.assertEqual(len(re.findall(r"\B", "a")), 0) 473 # If there are no words, there are no boundaries 474 self.assertEqual(len(re.findall(r"\b", " ")), 0) 475 self.assertEqual(len(re.findall(r"\b", " ")), 0) 476 # Can match around the whitespace. 477 self.assertEqual(len(re.findall(r"\B", " ")), 2) 478 479 @requires_unicode 480 def test_bigcharset(self): 481 self.assertEqual(re.match(u(r"([\u2222\u2223])"), 482 unichr(0x2222)).group(1), unichr(0x2222)) 483 self.assertEqual(re.match(u(r"([\u2222\u2223])"), 484 unichr(0x2222), re.UNICODE).group(1), unichr(0x2222)) 485 r = u'[%s]' % u''.join(map(unichr, range(256, 2**16, 255))) 486 self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(), unichr(0xff01)) 487 488 def test_big_codesize(self): 489 # Issue #1160 490 r = re.compile('|'.join(('%d'%x for x in range(10000)))) 491 self.assertTrue(r.match('1000')) 492 self.assertTrue(r.match('9999')) 493 494 def test_anyall(self): 495 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), 496 "a\nb") 497 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), 498 "a\n\nb") 499 500 def test_lookahead(self): 501 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a") 502 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a") 503 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a") 504 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a") 505 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") 506 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") 507 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") 508 509 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") 510 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") 511 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") 512 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") 513 514 # Group reference. 515 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba')) 516 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac')) 517 # Named group reference. 518 self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba')) 519 self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac')) 520 # Conditional group reference. 521 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 522 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc')) 523 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 524 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc')) 525 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc')) 526 # Group used before defined. 527 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc')) 528 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc')) 529 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc')) 530 531 def test_lookbehind(self): 532 self.assertTrue(re.match(r'ab(?<=b)c', 'abc')) 533 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc')) 534 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc')) 535 self.assertTrue(re.match(r'ab(?<!c)c', 'abc')) 536 # Group reference. 537 with check_warnings(('', RuntimeWarning)): 538 re.compile(r'(a)a(?<=\1)c') 539 # Named group reference. 540 with check_warnings(('', RuntimeWarning)): 541 re.compile(r'(?P<g>a)a(?<=(?P=g))c') 542 # Conditional group reference. 543 with check_warnings(('', RuntimeWarning)): 544 re.compile(r'(a)b(?<=(?(1)b|x))c') 545 # Group used before defined. 546 with check_warnings(('', RuntimeWarning)): 547 re.compile(r'(a)b(?<=(?(2)b|x))(c)') 548 549 def test_ignore_case(self): 550 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 551 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") 552 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") 553 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") 554 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") 555 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") 556 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") 557 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") 558 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") 559 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") 560 561 if have_unicode: 562 assert u(r'\u212a').lower() == u'k' # 'K' 563 self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I)) 564 self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I)) 565 self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I)) 566 self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I)) 567 assert u(r'\u017f').upper() == u'S' # 'ſ' 568 self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I)) 569 self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I)) 570 self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I)) 571 self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I)) 572 573 def test_ignore_case_set(self): 574 self.assertTrue(re.match(r'[19A]', 'A', re.I)) 575 self.assertTrue(re.match(r'[19a]', 'a', re.I)) 576 self.assertTrue(re.match(r'[19a]', 'A', re.I)) 577 self.assertTrue(re.match(r'[19A]', 'a', re.I)) 578 if have_unicode: 579 self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I)) 580 self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I)) 581 self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I)) 582 self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I)) 583 assert u(r'\u212a').lower() == u'k' # 'K' 584 self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I)) 585 self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I)) 586 self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I)) 587 self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I)) 588 assert u(r'\u017f').upper() == u'S' # 'ſ' 589 self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I)) 590 self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I)) 591 self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I)) 592 self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I)) 593 594 def test_ignore_case_range(self): 595 # Issues #3511, #17381. 596 self.assertTrue(re.match(r'[9-a]', '_', re.I)) 597 self.assertIsNone(re.match(r'[9-A]', '_', re.I)) 598 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I)) 599 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I)) 600 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I)) 601 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I)) 602 if have_unicode: 603 self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I)) 604 self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I)) 605 self.assertTrue(re.match(u(r'[\xc0-\xde]'), 606 u(r'\xd7'), re.U | re.I)) 607 self.assertIsNone(re.match(u(r'[\xc0-\xde]'), 608 u(r'\xf7'), re.U | re.I)) 609 self.assertTrue(re.match(u(r'[\xe0-\xfe]'), 610 u(r'\xf7'), re.U | re.I)) 611 self.assertIsNone(re.match(u(r'[\xe0-\xfe]'), 612 u(r'\xd7'), re.U | re.I)) 613 self.assertTrue(re.match(u(r'[\u0430-\u045f]'), 614 u(r'\u0450'), re.U | re.I)) 615 self.assertTrue(re.match(u(r'[\u0430-\u045f]'), 616 u(r'\u0400'), re.U | re.I)) 617 self.assertTrue(re.match(u(r'[\u0400-\u042f]'), 618 u(r'\u0450'), re.U | re.I)) 619 self.assertTrue(re.match(u(r'[\u0400-\u042f]'), 620 u(r'\u0400'), re.U | re.I)) 621 if sys.maxunicode > 0xffff: 622 self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'), 623 u(r'\U00010428'), re.U | re.I)) 624 self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'), 625 u(r'\U00010400'), re.U | re.I)) 626 self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'), 627 u(r'\U00010428'), re.U | re.I)) 628 self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'), 629 u(r'\U00010400'), re.U | re.I)) 630 631 assert u(r'\u212a').lower() == u'k' # 'K' 632 self.assertTrue(re.match(ur'[J-M]', u(r'\u212a'), re.U | re.I)) 633 self.assertTrue(re.match(ur'[j-m]', u(r'\u212a'), re.U | re.I)) 634 self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'K', re.U | re.I)) 635 self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'k', re.U | re.I)) 636 assert u(r'\u017f').upper() == u'S' # 'ſ' 637 self.assertTrue(re.match(ur'[R-T]', u(r'\u017f'), re.U | re.I)) 638 self.assertTrue(re.match(ur'[r-t]', u(r'\u017f'), re.U | re.I)) 639 self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u'S', re.U | re.I)) 640 self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u's', re.U | re.I)) 641 642 def test_category(self): 643 self.assertEqual(re.match(r"(\s)", " ").group(1), " ") 644 645 def test_getlower(self): 646 import _sre 647 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) 648 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a')) 649 if have_unicode: 650 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) 651 652 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 653 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") 654 655 def test_not_literal(self): 656 self.assertEqual(re.search("\s([^a])", " b").group(1), "b") 657 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb") 658 659 def test_search_coverage(self): 660 self.assertEqual(re.search("\s(b)", " b").group(1), "b") 661 self.assertEqual(re.search("a\s", "a ").group(0), "a ") 662 663 def assertMatch(self, pattern, text, match=None, span=None, 664 matcher=re.match): 665 if match is None and span is None: 666 # the pattern matches the whole text 667 match = text 668 span = (0, len(text)) 669 elif match is None or span is None: 670 raise ValueError('If match is not None, span should be specified ' 671 '(and vice versa).') 672 m = matcher(pattern, text) 673 self.assertTrue(m) 674 self.assertEqual(m.group(), match) 675 self.assertEqual(m.span(), span) 676 677 @requires_unicode 678 def test_re_escape(self): 679 alnum_chars = unicode(string.ascii_letters + string.digits) 680 p = u''.join(unichr(i) for i in range(256)) 681 for c in p: 682 if c in alnum_chars: 683 self.assertEqual(re.escape(c), c) 684 elif c == u'\x00': 685 self.assertEqual(re.escape(c), u'\\000') 686 else: 687 self.assertEqual(re.escape(c), u'\\' + c) 688 self.assertMatch(re.escape(c), c) 689 self.assertMatch(re.escape(p), p) 690 691 def test_re_escape_byte(self): 692 alnum_chars = string.ascii_letters + string.digits 693 p = ''.join(chr(i) for i in range(256)) 694 for b in p: 695 if b in alnum_chars: 696 self.assertEqual(re.escape(b), b) 697 elif b == b'\x00': 698 self.assertEqual(re.escape(b), b'\\000') 699 else: 700 self.assertEqual(re.escape(b), b'\\' + b) 701 self.assertMatch(re.escape(b), b) 702 self.assertMatch(re.escape(p), p) 703 704 @requires_unicode 705 def test_re_escape_non_ascii(self): 706 s = u(r'xxx\u2620\u2620\u2620xxx') 707 s_escaped = re.escape(s) 708 self.assertEqual(s_escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx')) 709 self.assertMatch(s_escaped, s) 710 self.assertMatch(u'.%s+.' % re.escape(unichr(0x2620)), s, 711 u(r'x\u2620\u2620\u2620x'), (2, 7), re.search) 712 713 def test_re_escape_non_ascii_bytes(self): 714 b = b'y\xe2\x98\xa0y\xe2\x98\xa0y' 715 b_escaped = re.escape(b) 716 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y') 717 self.assertMatch(b_escaped, b) 718 res = re.findall(re.escape(b'\xe2\x98\xa0'), b) 719 self.assertEqual(len(res), 2) 720 721 def test_pickling(self): 722 import pickle 723 self.pickle_test(pickle) 724 import cPickle 725 self.pickle_test(cPickle) 726 # old pickles expect the _compile() reconstructor in sre module 727 import_module("sre", deprecated=True) 728 from sre import _compile 729 # current pickle expects the _compile() reconstructor in re module 730 from re import _compile 731 732 def pickle_test(self, pickle): 733 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)') 734 for proto in range(pickle.HIGHEST_PROTOCOL + 1): 735 pickled = pickle.dumps(oldpat, proto) 736 newpat = pickle.loads(pickled) 737 self.assertEqual(newpat, oldpat) 738 739 def test_constants(self): 740 self.assertEqual(re.I, re.IGNORECASE) 741 self.assertEqual(re.L, re.LOCALE) 742 self.assertEqual(re.M, re.MULTILINE) 743 self.assertEqual(re.S, re.DOTALL) 744 self.assertEqual(re.X, re.VERBOSE) 745 746 def test_flags(self): 747 for flag in [re.I, re.M, re.X, re.S, re.L]: 748 self.assertTrue(re.compile('^pattern$', flag)) 749 750 def test_sre_character_literals(self): 751 for i in [0, 8, 16, 32, 64, 127, 128, 255]: 752 self.assertTrue(re.match(r"\%03o" % i, chr(i))) 753 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0")) 754 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8")) 755 self.assertTrue(re.match(r"\x%02x" % i, chr(i))) 756 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0")) 757 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z")) 758 self.assertRaises(re.error, re.match, "\911", "") 759 760 def test_sre_character_class_literals(self): 761 for i in [0, 8, 16, 32, 64, 127, 128, 255]: 762 self.assertTrue(re.match(r"[\%03o]" % i, chr(i))) 763 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i))) 764 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i))) 765 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i))) 766 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i))) 767 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i))) 768 self.assertRaises(re.error, re.match, "[\911]", "") 769 770 def test_bug_113254(self): 771 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) 772 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) 773 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) 774 775 def test_bug_527371(self): 776 # bug described in patches 527371/672491 777 self.assertIsNone(re.match(r'(a)?a','a').lastindex) 778 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) 779 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a') 780 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a') 781 self.assertEqual(re.match("((a))", "a").lastindex, 1) 782 783 def test_bug_545855(self): 784 # bug 545855 -- This pattern failed to cause a compile error as it 785 # should, instead provoking a TypeError. 786 self.assertRaises(re.error, re.compile, 'foo[a-') 787 788 def test_bug_418626(self): 789 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code 790 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of 791 # pattern '*?' on a long string. 792 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) 793 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), 794 20003) 795 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) 796 # non-simple '*?' still used to hit the recursion limit, before the 797 # non-recursive scheme was implemented. 798 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) 799 800 @requires_unicode 801 def test_bug_612074(self): 802 pat=u"["+re.escape(unichr(0x2039))+u"]" 803 self.assertEqual(re.compile(pat) and 1, 1) 804 805 def test_stack_overflow(self): 806 # nasty cases that used to overflow the straightforward recursive 807 # implementation of repeated groups. 808 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') 809 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') 810 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') 811 812 def test_unlimited_zero_width_repeat(self): 813 # Issue #9669 814 self.assertIsNone(re.match(r'(?:a?)*y', 'z')) 815 self.assertIsNone(re.match(r'(?:a?)+y', 'z')) 816 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z')) 817 self.assertIsNone(re.match(r'(?:a?)*?y', 'z')) 818 self.assertIsNone(re.match(r'(?:a?)+?y', 'z')) 819 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z')) 820 821 def test_scanner(self): 822 def s_ident(scanner, token): return token 823 def s_operator(scanner, token): return "op%s" % token 824 def s_float(scanner, token): return float(token) 825 def s_int(scanner, token): return int(token) 826 827 scanner = Scanner([ 828 (r"[a-zA-Z_]\w*", s_ident), 829 (r"\d+\.\d*", s_float), 830 (r"\d+", s_int), 831 (r"=|\+|-|\*|/", s_operator), 832 (r"\s+", None), 833 ]) 834 835 self.assertTrue(scanner.scanner.scanner("").pattern) 836 837 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), 838 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 839 'op+', 'bar'], '')) 840 841 def test_bug_448951(self): 842 # bug 448951 (similar to 429357, but with single char match) 843 # (Also test greedy matches.) 844 for op in '','?','*': 845 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), 846 (None, None)) 847 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), 848 ('a:', 'a')) 849 850 def test_bug_725106(self): 851 # capturing groups in alternatives in repeats 852 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), 853 ('b', 'a')) 854 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), 855 ('c', 'b')) 856 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), 857 ('b', None)) 858 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), 859 ('b', None)) 860 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), 861 ('b', 'a')) 862 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), 863 ('c', 'b')) 864 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), 865 ('b', None)) 866 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), 867 ('b', None)) 868 869 def test_bug_725149(self): 870 # mark_stack_base restoring before restoring marks 871 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), 872 ('a', None)) 873 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), 874 ('a', None, None)) 875 876 @requires_unicode 877 def test_bug_764548(self): 878 # bug 764548, re.compile() barfs on str/unicode subclasses 879 class my_unicode(unicode): pass 880 pat = re.compile(my_unicode("abc")) 881 self.assertIsNone(pat.match("xyz")) 882 883 def test_finditer(self): 884 iter = re.finditer(r":+", "a:b::c:::d") 885 self.assertEqual([item.group(0) for item in iter], 886 [":", "::", ":::"]) 887 888 @requires_unicode 889 def test_bug_926075(self): 890 self.assertIsNot(re.compile('bug_926075'), 891 re.compile(u'bug_926075')) 892 893 @requires_unicode 894 def test_bug_931848(self): 895 pattern = u(r"[\u002E\u3002\uFF0E\uFF61]") 896 self.assertEqual(re.compile(pattern).split("a.b.c"), 897 ['a','b','c']) 898 899 def test_bug_581080(self): 900 iter = re.finditer(r"\s", "a b") 901 self.assertEqual(iter.next().span(), (1,2)) 902 self.assertRaises(StopIteration, iter.next) 903 904 scanner = re.compile(r"\s").scanner("a b") 905 self.assertEqual(scanner.search().span(), (1, 2)) 906 self.assertIsNone(scanner.search()) 907 908 def test_bug_817234(self): 909 iter = re.finditer(r".*", "asdf") 910 self.assertEqual(iter.next().span(), (0, 4)) 911 self.assertEqual(iter.next().span(), (4, 4)) 912 self.assertRaises(StopIteration, iter.next) 913 914 @requires_unicode 915 def test_bug_6561(self): 916 # '\d' should match characters in Unicode category 'Nd' 917 # (Number, Decimal Digit), but not those in 'Nl' (Number, 918 # Letter) or 'No' (Number, Other). 919 decimal_digits = [ 920 unichr(0x0037), # '\N{DIGIT SEVEN}', category 'Nd' 921 unichr(0x0e58), # '\N{THAI DIGIT SIX}', category 'Nd' 922 unichr(0xff10), # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' 923 ] 924 for x in decimal_digits: 925 self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x) 926 927 not_decimal_digits = [ 928 unichr(0x2165), # '\N{ROMAN NUMERAL SIX}', category 'Nl' 929 unichr(0x3039), # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' 930 unichr(0x2082), # '\N{SUBSCRIPT TWO}', category 'No' 931 unichr(0x32b4), # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' 932 ] 933 for x in not_decimal_digits: 934 self.assertIsNone(re.match('^\d$', x, re.UNICODE)) 935 936 def test_empty_array(self): 937 # SF buf 1647541 938 import array 939 typecodes = 'cbBhHiIlLfd' 940 if have_unicode: 941 typecodes += 'u' 942 for typecode in typecodes: 943 a = array.array(typecode) 944 self.assertIsNone(re.compile("bla").match(a)) 945 self.assertEqual(re.compile("").match(a).groups(), ()) 946 947 @requires_unicode 948 def test_inline_flags(self): 949 # Bug #1700 950 upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow 951 lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow 952 953 p = re.compile(upper_char, re.I | re.U) 954 q = p.match(lower_char) 955 self.assertTrue(q) 956 957 p = re.compile(lower_char, re.I | re.U) 958 q = p.match(upper_char) 959 self.assertTrue(q) 960 961 p = re.compile('(?i)' + upper_char, re.U) 962 q = p.match(lower_char) 963 self.assertTrue(q) 964 965 p = re.compile('(?i)' + lower_char, re.U) 966 q = p.match(upper_char) 967 self.assertTrue(q) 968 969 p = re.compile('(?iu)' + upper_char) 970 q = p.match(lower_char) 971 self.assertTrue(q) 972 973 p = re.compile('(?iu)' + lower_char) 974 q = p.match(upper_char) 975 self.assertTrue(q) 976 977 self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char)) 978 self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char)) 979 980 # Incompatibilities 981 re.purge() 982 with check_py3k_warnings(): 983 re.compile('', re.LOCALE|re.UNICODE) 984 with check_py3k_warnings(): 985 re.compile('(?L)', re.UNICODE) 986 with check_py3k_warnings(): 987 re.compile('(?u)', re.LOCALE) 988 with check_py3k_warnings(): 989 re.compile('(?Lu)') 990 with check_py3k_warnings(): 991 re.compile('(?uL)') 992 993 def test_dollar_matches_twice(self): 994 "$ matches the end of string, and just before the terminating \n" 995 pattern = re.compile('$') 996 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') 997 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') 998 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 999 1000 pattern = re.compile('$', re.MULTILINE) 1001 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) 1002 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') 1003 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 1004 1005 def test_dealloc(self): 1006 # issue 3299: check for segfault in debug build 1007 import _sre 1008 # the overflow limit is different on wide and narrow builds and it 1009 # depends on the definition of SRE_CODE (see sre.h). 1010 # 2**128 should be big enough to overflow on both. For smaller values 1011 # a RuntimeError is raised instead of OverflowError. 1012 long_overflow = 2**128 1013 self.assertRaises(TypeError, re.finditer, "a", {}) 1014 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) 1015 1016 def test_compile(self): 1017 # Test return value when given string and pattern as parameter 1018 pattern = re.compile('random pattern') 1019 self.assertIsInstance(pattern, re._pattern_type) 1020 same_pattern = re.compile(pattern) 1021 self.assertIsInstance(same_pattern, re._pattern_type) 1022 self.assertIs(same_pattern, pattern) 1023 # Test behaviour when not given a string or pattern as parameter 1024 self.assertRaises(TypeError, re.compile, 0) 1025 1026 def test_bug_13899(self): 1027 # Issue #13899: re pattern r"[\A]" should work like "A" but matches 1028 # nothing. Ditto B and Z. 1029 with check_py3k_warnings(): 1030 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'), 1031 ['A', 'B', '\b', 'C', 'Z']) 1032 1033 @precisionbigmemtest(size=_2G, memuse=1) 1034 def test_large_search(self, size): 1035 # Issue #10182: indices were 32-bit-truncated. 1036 s = 'a' * size 1037 m = re.search('$', s) 1038 self.assertIsNotNone(m) 1039 self.assertEqual(m.start(), size) 1040 self.assertEqual(m.end(), size) 1041 1042 # The huge memuse is because of re.sub() using a list and a join() 1043 # to create the replacement result. 1044 @precisionbigmemtest(size=_2G, memuse=16 + 2) 1045 def test_large_subn(self, size): 1046 # Issue #10182: indices were 32-bit-truncated. 1047 s = 'a' * size 1048 r, n = re.subn('', '', s) 1049 self.assertEqual(r, s) 1050 self.assertEqual(n, size + 1) 1051 1052 1053 def test_repeat_minmax_overflow(self): 1054 # Issue #13169 1055 string = "x" * 100000 1056 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535)) 1057 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535)) 1058 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535)) 1059 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536)) 1060 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536)) 1061 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536)) 1062 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t. 1063 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128) 1064 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128) 1065 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128) 1066 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) 1067 1068 @cpython_only 1069 def test_repeat_minmax_overflow_maxrepeat(self): 1070 try: 1071 from _sre import MAXREPEAT 1072 except ImportError: 1073 self.skipTest('requires _sre.MAXREPEAT constant') 1074 string = "x" * 100000 1075 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string)) 1076 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(), 1077 (0, 100000)) 1078 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string)) 1079 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT) 1080 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT) 1081 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT) 1082 1083 def test_backref_group_name_in_exception(self): 1084 # Issue 17341: Poor error message when compiling invalid regex 1085 with self.assertRaisesRegexp(sre_constants.error, '<foo>'): 1086 re.compile('(?P=<foo>)') 1087 1088 def test_group_name_in_exception(self): 1089 # Issue 17341: Poor error message when compiling invalid regex 1090 with self.assertRaisesRegexp(sre_constants.error, '\?foo'): 1091 re.compile('(?P<?foo>)') 1092 1093 def test_issue17998(self): 1094 for reps in '*', '+', '?', '{1}': 1095 for mod in '', '?': 1096 pattern = '.' + reps + mod + 'yz' 1097 self.assertEqual(re.compile(pattern, re.S).findall('xyz'), 1098 ['xyz'], msg=pattern) 1099 if have_unicode: 1100 pattern = unicode(pattern) 1101 self.assertEqual(re.compile(pattern, re.S).findall(u'xyz'), 1102 [u'xyz'], msg=pattern) 1103 1104 1105 def test_bug_2537(self): 1106 # issue 2537: empty submatches 1107 for outer_op in ('{0,}', '*', '+', '{1,187}'): 1108 for inner_op in ('{0,}', '*', '?'): 1109 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op)) 1110 m = r.match("xyyzy") 1111 self.assertEqual(m.group(0), "xyy") 1112 self.assertEqual(m.group(1), "") 1113 self.assertEqual(m.group(2), "y") 1114 1115 def test_debug_flag(self): 1116 pat = r'(\.)(?:[ch]|py)(?(1)$|: )' 1117 with captured_stdout() as out: 1118 re.compile(pat, re.DEBUG) 1119 dump = '''\ 1120subpattern 1 1121 literal 46 1122subpattern None 1123 branch 1124 in 1125 literal 99 1126 literal 104 1127 or 1128 literal 112 1129 literal 121 1130subpattern None 1131 groupref_exists 1 1132 at at_end 1133 else 1134 literal 58 1135 literal 32 1136''' 1137 self.assertEqual(out.getvalue(), dump) 1138 # Debug output is output again even a second time (bypassing 1139 # the cache -- issue #20426). 1140 with captured_stdout() as out: 1141 re.compile(pat, re.DEBUG) 1142 self.assertEqual(out.getvalue(), dump) 1143 1144 def test_keyword_parameters(self): 1145 # Issue #20283: Accepting the string keyword parameter. 1146 pat = re.compile(r'(ab)') 1147 self.assertEqual( 1148 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9)) 1149 self.assertEqual( 1150 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9)) 1151 self.assertEqual( 1152 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab']) 1153 self.assertEqual( 1154 pat.split(string='abracadabra', maxsplit=1), 1155 ['', 'ab', 'racadabra']) 1156 1157 def test_match_group_takes_long(self): 1158 self.assertEqual(re.match("(foo)", "foo").group(1L), "foo") 1159 self.assertRaises(IndexError, re.match("", "").group, sys.maxint + 1) 1160 1161 def test_locale_caching(self): 1162 # Issue #22410 1163 oldlocale = locale.setlocale(locale.LC_CTYPE) 1164 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1165 for loc in 'en_US.iso88591', 'en_US.utf8': 1166 try: 1167 locale.setlocale(locale.LC_CTYPE, loc) 1168 except locale.Error: 1169 # Unsupported locale on this system 1170 self.skipTest('test needs %s locale' % loc) 1171 1172 re.purge() 1173 self.check_en_US_iso88591() 1174 self.check_en_US_utf8() 1175 re.purge() 1176 self.check_en_US_utf8() 1177 self.check_en_US_iso88591() 1178 1179 def check_en_US_iso88591(self): 1180 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') 1181 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1182 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1183 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1184 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1185 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5')) 1186 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5')) 1187 1188 def check_en_US_utf8(self): 1189 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') 1190 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1191 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1192 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1193 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1194 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) 1195 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) 1196 1197 1198def run_re_tests(): 1199 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR 1200 if verbose: 1201 print 'Running re_tests test suite' 1202 else: 1203 # To save time, only run the first and last 10 tests 1204 #tests = tests[:10] + tests[-10:] 1205 pass 1206 1207 for t in tests: 1208 sys.stdout.flush() 1209 pattern = s = outcome = repl = expected = None 1210 if len(t) == 5: 1211 pattern, s, outcome, repl, expected = t 1212 elif len(t) == 3: 1213 pattern, s, outcome = t 1214 else: 1215 raise ValueError, ('Test tuples should have 3 or 5 fields', t) 1216 1217 try: 1218 obj = re.compile(pattern) 1219 except re.error: 1220 if outcome == SYNTAX_ERROR: pass # Expected a syntax error 1221 else: 1222 print '=== Syntax error:', t 1223 except KeyboardInterrupt: raise KeyboardInterrupt 1224 except: 1225 print '*** Unexpected error ***', t 1226 if verbose: 1227 traceback.print_exc(file=sys.stdout) 1228 else: 1229 try: 1230 result = obj.search(s) 1231 except re.error, msg: 1232 print '=== Unexpected exception', t, repr(msg) 1233 if outcome == SYNTAX_ERROR: 1234 # This should have been a syntax error; forget it. 1235 pass 1236 elif outcome == FAIL: 1237 if result is None: pass # No match, as expected 1238 else: print '=== Succeeded incorrectly', t 1239 elif outcome == SUCCEED: 1240 if result is not None: 1241 # Matched, as expected, so now we compute the 1242 # result string and compare it to our expected result. 1243 start, end = result.span(0) 1244 vardict={'found': result.group(0), 1245 'groups': result.group(), 1246 'flags': result.re.flags} 1247 for i in range(1, 100): 1248 try: 1249 gi = result.group(i) 1250 # Special hack because else the string concat fails: 1251 if gi is None: 1252 gi = "None" 1253 except IndexError: 1254 gi = "Error" 1255 vardict['g%d' % i] = gi 1256 for i in result.re.groupindex.keys(): 1257 try: 1258 gi = result.group(i) 1259 if gi is None: 1260 gi = "None" 1261 except IndexError: 1262 gi = "Error" 1263 vardict[i] = gi 1264 repl = eval(repl, vardict) 1265 if repl != expected: 1266 print '=== grouping error', t, 1267 print repr(repl) + ' should be ' + repr(expected) 1268 else: 1269 print '=== Failed incorrectly', t 1270 1271 # Try the match on a unicode string, and check that it 1272 # still succeeds. 1273 try: 1274 result = obj.search(unicode(s, "latin-1")) 1275 if result is None: 1276 print '=== Fails on unicode match', t 1277 except NameError: 1278 continue # 1.5.2 1279 except TypeError: 1280 continue # unicode test case 1281 1282 # Try the match on a unicode pattern, and check that it 1283 # still succeeds. 1284 obj=re.compile(unicode(pattern, "latin-1")) 1285 result = obj.search(s) 1286 if result is None: 1287 print '=== Fails on unicode pattern match', t 1288 1289 # Try the match with the search area limited to the extent 1290 # of the match and see if it still succeeds. \B will 1291 # break (because it won't match at the end or start of a 1292 # string), so we'll ignore patterns that feature it. 1293 1294 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \ 1295 and result is not None: 1296 obj = re.compile(pattern) 1297 result = obj.search(s, result.start(0), result.end(0) + 1) 1298 if result is None: 1299 print '=== Failed on range-limited match', t 1300 1301 # Try the match with IGNORECASE enabled, and check that it 1302 # still succeeds. 1303 obj = re.compile(pattern, re.IGNORECASE) 1304 result = obj.search(s) 1305 if result is None: 1306 print '=== Fails on case-insensitive match', t 1307 1308 # Try the match with LOCALE enabled, and check that it 1309 # still succeeds. 1310 obj = re.compile(pattern, re.LOCALE) 1311 result = obj.search(s) 1312 if result is None: 1313 print '=== Fails on locale-sensitive match', t 1314 1315 # Try the match with UNICODE locale enabled, and check 1316 # that it still succeeds. 1317 obj = re.compile(pattern, re.UNICODE) 1318 result = obj.search(s) 1319 if result is None: 1320 print '=== Fails on unicode-sensitive match', t 1321 1322def test_main(): 1323 run_unittest(ReTests) 1324 deprecations = [ 1325 ('bad escape', DeprecationWarning), 1326 ] 1327 with check_py3k_warnings(*deprecations): 1328 run_re_tests() 1329 1330if __name__ == "__main__": 1331 test_main() 1332