1from test.support import (gc_collect, bigmemtest, _2G, 2 cpython_only, captured_stdout, 3 check_disallow_instantiation) 4import locale 5import re 6import sre_compile 7import string 8import unittest 9import warnings 10from re import Scanner 11from weakref import proxy 12 13# Misc tests from Tim Peters' re.doc 14 15# WARNING: Don't change details in these tests if you don't know 16# what you're doing. Some of these tests were carefully modeled to 17# cover most of the code. 18 19class S(str): 20 def __getitem__(self, index): 21 return S(super().__getitem__(index)) 22 23class B(bytes): 24 def __getitem__(self, index): 25 return B(super().__getitem__(index)) 26 27class ReTests(unittest.TestCase): 28 29 def assertTypedEqual(self, actual, expect, msg=None): 30 self.assertEqual(actual, expect, msg) 31 def recurse(actual, expect): 32 if isinstance(expect, (tuple, list)): 33 for x, y in zip(actual, expect): 34 recurse(x, y) 35 else: 36 self.assertIs(type(actual), type(expect), msg) 37 recurse(actual, expect) 38 39 def checkPatternError(self, pattern, errmsg, pos=None): 40 with self.assertRaises(re.error) as cm: 41 re.compile(pattern) 42 with self.subTest(pattern=pattern): 43 err = cm.exception 44 self.assertEqual(err.msg, errmsg) 45 if pos is not None: 46 self.assertEqual(err.pos, pos) 47 48 def checkTemplateError(self, pattern, repl, string, errmsg, pos=None): 49 with self.assertRaises(re.error) as cm: 50 re.sub(pattern, repl, string) 51 with self.subTest(pattern=pattern, repl=repl): 52 err = cm.exception 53 self.assertEqual(err.msg, errmsg) 54 if pos is not None: 55 self.assertEqual(err.pos, pos) 56 57 def test_keep_buffer(self): 58 # See bug 14212 59 b = bytearray(b'x') 60 it = re.finditer(b'a', b) 61 with self.assertRaises(BufferError): 62 b.extend(b'x'*400) 63 list(it) 64 del it 65 gc_collect() 66 b.extend(b'x'*400) 67 68 def test_weakref(self): 69 s = 'QabbbcR' 70 x = re.compile('ab+c') 71 y = proxy(x) 72 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR')) 
73 74 def test_search_star_plus(self): 75 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0)) 76 self.assertEqual(re.search('x*', 'axx').span(), (0, 0)) 77 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3)) 78 self.assertEqual(re.search('x+', 'axx').span(), (1, 3)) 79 self.assertIsNone(re.search('x', 'aaa')) 80 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0)) 81 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0)) 82 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3)) 83 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) 84 self.assertIsNone(re.match('a+', 'xxx')) 85 86 def bump_num(self, matchobj): 87 int_value = int(matchobj.group(0)) 88 return str(int_value + 1) 89 90 def test_basic_re_sub(self): 91 self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz') 92 self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz') 93 self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz') 94 self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz') 95 self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz') 96 self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz') 97 for y in ("\xe0", "\u0430", "\U0001d49c"): 98 self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz') 99 100 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') 101 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), 102 '9.3 -3 24x100y') 103 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), 104 '9.3 -3 23x99y') 105 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3), 106 '9.3 -3 23x99y') 107 108 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') 109 self.assertEqual(re.sub('.', r"\n", 'x'), '\n') 110 111 s = r"\1\1" 112 self.assertEqual(re.sub('(.)', s, 'x'), 'xx') 113 self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s) 114 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) 115 116 self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx') 
117 self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx') 118 self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx') 119 self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx') 120 121 self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 122 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') 123 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), 124 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8))) 125 for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': 126 with self.subTest(c): 127 with self.assertRaises(re.error): 128 self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c) 129 130 self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest') 131 132 def test_bug_449964(self): 133 # fails for group followed by other escape 134 self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'), 135 'xx\bxx\b') 136 137 def test_bug_449000(self): 138 # Test for sub() on escaped characters 139 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), 140 'abc\ndef\n') 141 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), 142 'abc\ndef\n') 143 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), 144 'abc\ndef\n') 145 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 146 'abc\ndef\n') 147 148 def test_bug_1661(self): 149 # Verify that flags do not get silently ignored with compiled patterns 150 pattern = re.compile('.') 151 self.assertRaises(ValueError, re.match, pattern, 'A', re.I) 152 self.assertRaises(ValueError, re.search, pattern, 'A', re.I) 153 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) 154 self.assertRaises(ValueError, re.compile, pattern, re.I) 155 156 def test_bug_3629(self): 157 # A regex that triggered a bug in the sre-code validator 158 re.compile("(?P<quote>)(?(quote))") 159 160 def test_sub_template_numeric_escape(self): 161 # bug 776311 and friends 162 self.assertEqual(re.sub('x', r'\0', 'x'), '\0') 163 self.assertEqual(re.sub('x', 
r'\000', 'x'), '\000') 164 self.assertEqual(re.sub('x', r'\001', 'x'), '\001') 165 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') 166 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') 167 self.assertEqual(re.sub('x', r'\111', 'x'), '\111') 168 self.assertEqual(re.sub('x', r'\117', 'x'), '\117') 169 self.assertEqual(re.sub('x', r'\377', 'x'), '\377') 170 171 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') 172 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') 173 174 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') 175 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') 176 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') 177 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') 178 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') 179 180 self.checkTemplateError('x', r'\400', 'x', 181 r'octal escape value \400 outside of ' 182 r'range 0-0o377', 0) 183 self.checkTemplateError('x', r'\777', 'x', 184 r'octal escape value \777 outside of ' 185 r'range 0-0o377', 0) 186 187 self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1) 188 self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1) 189 self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1) 190 self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1) 191 self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1) 192 self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1) 193 self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1) 194 self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1) 195 self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1) 196 self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1) 197 self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1) 198 self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1) 199 self.checkTemplateError('x', 
r'\8', '', 'invalid group reference 8', 1) 200 201 # in python2.3 (etc), these loop endlessly in sre_parser.py 202 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') 203 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), 204 'xz8') 205 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), 206 'xza') 207 208 def test_qualified_re_sub(self): 209 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') 210 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') 211 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa') 212 213 def test_bug_114660(self): 214 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 215 'hello there') 216 217 def test_symbolic_groups(self): 218 re.compile(r'(?P<a>x)(?P=a)(?(a)y)') 219 re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)') 220 re.compile(r'(?P<a1>x)\1(?(1)y)') 221 self.checkPatternError(r'(?P<a>)(?P<a>)', 222 "redefinition of group name 'a' as group 2; " 223 "was group 1") 224 self.checkPatternError(r'(?P<a>(?P=a))', 225 "cannot refer to an open group", 10) 226 self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px') 227 self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11) 228 self.checkPatternError(r'(?P=', 'missing group name', 4) 229 self.checkPatternError(r'(?P=)', 'missing group name', 4) 230 self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4) 231 self.checkPatternError(r'(?P=a)', "unknown group name 'a'") 232 self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'") 233 self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4) 234 self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4) 235 self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4) 236 self.checkPatternError(r'(?P<', 'missing group name', 4) 237 self.checkPatternError(r'(?P<>)', 'missing group name', 4) 238 self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4) 239 self.checkPatternError(r'(?P<a.>)', 
"bad character in group name 'a.'", 4) 240 self.checkPatternError(r'(?(', 'missing group name', 3) 241 self.checkPatternError(r'(?())', 'missing group name', 3) 242 self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3) 243 self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3) 244 self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3) 245 self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3) 246 # New valid/invalid identifiers in Python 3 247 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)') 248 re.compile('(?P<>x)(?P=)(?()y)') 249 self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) 250 # Support > 100 groups. 251 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 252 pat = '(?:%s)(?(200)z|t)' % pat 253 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) 254 255 def test_symbolic_refs(self): 256 self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx', 257 'missing >, unterminated name', 3) 258 self.checkTemplateError('(?P<a>x)', r'\g<', 'xx', 259 'missing group name', 3) 260 self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2) 261 self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx', 262 "bad character in group name 'a a'", 3) 263 self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx', 264 'missing group name', 3) 265 self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx', 266 "bad character in group name '1a1'", 3) 267 self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx', 268 'invalid group reference 2', 3) 269 self.checkTemplateError('(?P<a>x)', r'\2', 'xx', 270 'invalid group reference 2', 1) 271 with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"): 272 re.sub('(?P<a>x)', r'\g<ab>', 'xx') 273 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') 274 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '') 275 self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx', 276 "bad character in group name '-1'", 3) 277 # New valid/invalid identifiers 
in Python 3 278 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx') 279 self.assertEqual(re.sub('(?P<>x)', r'\g<>', 'xx'), 'xx') 280 self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx', 281 "bad character in group name '©'", 3) 282 # Support > 100 groups. 283 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 284 self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8') 285 286 def test_re_subn(self): 287 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) 288 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) 289 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) 290 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) 291 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) 292 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2)) 293 294 def test_re_split(self): 295 for string in ":a:b::c", S(":a:b::c"): 296 self.assertTypedEqual(re.split(":", string), 297 ['', 'a', 'b', '', 'c']) 298 self.assertTypedEqual(re.split(":+", string), 299 ['', 'a', 'b', 'c']) 300 self.assertTypedEqual(re.split("(:+)", string), 301 ['', ':', 'a', ':', 'b', '::', 'c']) 302 for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), 303 memoryview(b":a:b::c")): 304 self.assertTypedEqual(re.split(b":", string), 305 [b'', b'a', b'b', b'', b'c']) 306 self.assertTypedEqual(re.split(b":+", string), 307 [b'', b'a', b'b', b'c']) 308 self.assertTypedEqual(re.split(b"(:+)", string), 309 [b'', b':', b'a', b':', b'b', b'::', b'c']) 310 for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", 311 "\U0001d49c\U0001d49e\U0001d4b5"): 312 string = ":%s:%s::%s" % (a, b, c) 313 self.assertEqual(re.split(":", string), ['', a, b, '', c]) 314 self.assertEqual(re.split(":+", string), ['', a, b, c]) 315 self.assertEqual(re.split("(:+)", string), 316 ['', ':', a, ':', b, '::', c]) 317 318 self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c']) 319 self.assertEqual(re.split("(:)+", ":a:b::c"), 320 ['', ':', 
'a', ':', 'b', ':', 'c']) 321 self.assertEqual(re.split("([b:]+)", ":a:b::c"), 322 ['', ':', 'a', ':b::', 'c']) 323 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), 324 ['', None, ':', 'a', None, ':', '', 'b', None, '', 325 None, '::', 'c']) 326 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), 327 ['', 'a', '', '', 'c']) 328 329 for sep, expected in [ 330 (':*', ['', '', 'a', '', 'b', '', 'c', '']), 331 ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']), 332 ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']), 333 ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']), 334 ]: 335 with self.subTest(sep=sep): 336 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) 337 338 for sep, expected in [ 339 ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']), 340 (r'\b', [':', 'a', ':', 'b', '::', 'c', '']), 341 (r'(?=:)', ['', ':a', ':b', ':', ':c']), 342 (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']), 343 ]: 344 with self.subTest(sep=sep): 345 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected) 346 347 def test_qualified_re_split(self): 348 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) 349 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c']) 350 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d']) 351 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2), 352 ['', ':', 'a', ':', 'b::c']) 353 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2), 354 ['', ':', 'a', ':', 'b::c']) 355 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), 356 ['', ':', '', '', 'a:b::c']) 357 358 def test_re_findall(self): 359 self.assertEqual(re.findall(":+", "abc"), []) 360 for string in "a:b::c:::d", S("a:b::c:::d"): 361 self.assertTypedEqual(re.findall(":+", string), 362 [":", "::", ":::"]) 363 self.assertTypedEqual(re.findall("(:+)", string), 364 [":", "::", ":::"]) 365 self.assertTypedEqual(re.findall("(:)(:*)", string), 366 [(":", ""), (":", ":"), (":", 
"::")]) 367 for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"), 368 memoryview(b"a:b::c:::d")): 369 self.assertTypedEqual(re.findall(b":+", string), 370 [b":", b"::", b":::"]) 371 self.assertTypedEqual(re.findall(b"(:+)", string), 372 [b":", b"::", b":::"]) 373 self.assertTypedEqual(re.findall(b"(:)(:*)", string), 374 [(b":", b""), (b":", b":"), (b":", b"::")]) 375 for x in ("\xe0", "\u0430", "\U0001d49c"): 376 xx = x * 2 377 xxx = x * 3 378 string = "a%sb%sc%sd" % (x, xx, xxx) 379 self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx]) 380 self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx]) 381 self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string), 382 [(x, ""), (x, x), (x, xx)]) 383 384 def test_bug_117612(self): 385 self.assertEqual(re.findall(r"(a|(b))", "aba"), 386 [("a", ""),("b", "b"),("a", "")]) 387 388 def test_re_match(self): 389 for string in 'a', S('a'): 390 self.assertEqual(re.match('a', string).groups(), ()) 391 self.assertEqual(re.match('(a)', string).groups(), ('a',)) 392 self.assertEqual(re.match('(a)', string).group(0), 'a') 393 self.assertEqual(re.match('(a)', string).group(1), 'a') 394 self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a')) 395 for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'): 396 self.assertEqual(re.match(b'a', string).groups(), ()) 397 self.assertEqual(re.match(b'(a)', string).groups(), (b'a',)) 398 self.assertEqual(re.match(b'(a)', string).group(0), b'a') 399 self.assertEqual(re.match(b'(a)', string).group(1), b'a') 400 self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a')) 401 for a in ("\xe0", "\u0430", "\U0001d49c"): 402 self.assertEqual(re.match(a, a).groups(), ()) 403 self.assertEqual(re.match('(%s)' % a, a).groups(), (a,)) 404 self.assertEqual(re.match('(%s)' % a, a).group(0), a) 405 self.assertEqual(re.match('(%s)' % a, a).group(1), a) 406 self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a)) 407 408 pat = 
re.compile('((a)|(b))(c)?') 409 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) 410 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) 411 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) 412 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) 413 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) 414 415 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 416 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) 417 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), 418 (None, 'b', None)) 419 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) 420 421 def test_group(self): 422 class Index: 423 def __init__(self, value): 424 self.value = value 425 def __index__(self): 426 return self.value 427 # A single group 428 m = re.match('(a)(b)', 'ab') 429 self.assertEqual(m.group(), 'ab') 430 self.assertEqual(m.group(0), 'ab') 431 self.assertEqual(m.group(1), 'a') 432 self.assertEqual(m.group(Index(1)), 'a') 433 self.assertRaises(IndexError, m.group, -1) 434 self.assertRaises(IndexError, m.group, 3) 435 self.assertRaises(IndexError, m.group, 1<<1000) 436 self.assertRaises(IndexError, m.group, Index(1<<1000)) 437 self.assertRaises(IndexError, m.group, 'x') 438 # Multiple groups 439 self.assertEqual(m.group(2, 1), ('b', 'a')) 440 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a')) 441 442 def test_match_getitem(self): 443 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 444 445 m = pat.match('a') 446 self.assertEqual(m['a1'], 'a') 447 self.assertEqual(m['b2'], None) 448 self.assertEqual(m['c3'], None) 449 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None') 450 self.assertEqual(m[0], 'a') 451 self.assertEqual(m[1], 'a') 452 self.assertEqual(m[2], None) 453 self.assertEqual(m[3], None) 454 with self.assertRaisesRegex(IndexError, 'no such group'): 455 m['X'] 456 with self.assertRaisesRegex(IndexError, 'no such group'): 457 
m[-1] 458 with self.assertRaisesRegex(IndexError, 'no such group'): 459 m[4] 460 with self.assertRaisesRegex(IndexError, 'no such group'): 461 m[0, 1] 462 with self.assertRaisesRegex(IndexError, 'no such group'): 463 m[(0,)] 464 with self.assertRaisesRegex(IndexError, 'no such group'): 465 m[(0, 1)] 466 with self.assertRaisesRegex(IndexError, 'no such group'): 467 'a1={a2}'.format_map(m) 468 469 m = pat.match('ac') 470 self.assertEqual(m['a1'], 'a') 471 self.assertEqual(m['b2'], None) 472 self.assertEqual(m['c3'], 'c') 473 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c') 474 self.assertEqual(m[0], 'ac') 475 self.assertEqual(m[1], 'a') 476 self.assertEqual(m[2], None) 477 self.assertEqual(m[3], 'c') 478 479 # Cannot assign. 480 with self.assertRaises(TypeError): 481 m[0] = 1 482 483 # No len(). 484 self.assertRaises(TypeError, len, m) 485 486 def test_re_fullmatch(self): 487 # Issue 16203: Proposal: add re.fullmatch() method. 488 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1)) 489 for string in "ab", S("ab"): 490 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2)) 491 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"): 492 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2)) 493 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e": 494 r = r"%s|%s" % (a, a + b) 495 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2)) 496 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3)) 497 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3)) 498 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2)) 499 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3)) 500 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4)) 501 self.assertIsNone(re.fullmatch(r"a+", "ab")) 502 self.assertIsNone(re.fullmatch(r"abc$", "abc\n")) 503 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n")) 504 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n")) 505 
self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4)) 506 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4)) 507 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2)) 508 509 self.assertEqual( 510 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 511 self.assertEqual( 512 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 513 self.assertEqual( 514 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3)) 515 516 def test_re_groupref_exists(self): 517 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(), 518 ('(', 'a')) 519 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(), 520 (None, 'a')) 521 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)')) 522 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a')) 523 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), 524 ('a', 'b')) 525 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), 526 (None, 'd')) 527 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(), 528 (None, 'd')) 529 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(), 530 ('a', '')) 531 532 # Tests for bug #1177831: exercise groups other than the first group 533 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))') 534 self.assertEqual(p.match('abc').groups(), 535 ('a', 'b', 'c')) 536 self.assertEqual(p.match('ad').groups(), 537 ('a', None, 'd')) 538 self.assertIsNone(p.match('abd')) 539 self.assertIsNone(p.match('ac')) 540 541 # Support > 100 groups. 
542 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)) 543 pat = '(?:%s)(?(200)z)' % pat 544 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) 545 546 self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10) 547 self.checkPatternError(r'()(?(1)a|b', 548 'missing ), unterminated subpattern', 2) 549 self.checkPatternError(r'()(?(1)a|b|c)', 550 'conditional backref with more than ' 551 'two branches', 10) 552 553 def test_re_groupref_overflow(self): 554 from sre_constants import MAXGROUPS 555 self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', 556 'invalid group reference %d' % MAXGROUPS, 3) 557 self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS, 558 'invalid group reference %d' % MAXGROUPS, 10) 559 560 def test_re_groupref(self): 561 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), 562 ('|', 'a')) 563 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), 564 (None, 'a')) 565 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|')) 566 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a')) 567 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), 568 ('a', 'a')) 569 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), 570 (None, None)) 571 572 self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4) 573 574 def test_groupdict(self): 575 self.assertEqual(re.match('(?P<first>first) (?P<second>second)', 576 'first second').groupdict(), 577 {'first':'first', 'second':'second'}) 578 579 def test_expand(self): 580 self.assertEqual(re.match("(?P<first>first) (?P<second>second)", 581 "first second") 582 .expand(r"\2 \1 \g<second> \g<first>"), 583 "second first second first") 584 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)", 585 "first") 586 .expand(r"\2 \g<second>"), 587 " ") 588 589 def test_repeat_minmax(self): 590 self.assertIsNone(re.match(r"^(\w){1}$", "abc")) 591 self.assertIsNone(re.match(r"^(\w){1}?$", "abc")) 592 
self.assertIsNone(re.match(r"^(\w){1,2}$", "abc")) 593 self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc")) 594 595 self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c") 596 self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c") 597 self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c") 598 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c") 599 self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c") 600 self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c") 601 self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c") 602 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c") 603 604 self.assertIsNone(re.match(r"^x{1}$", "xxx")) 605 self.assertIsNone(re.match(r"^x{1}?$", "xxx")) 606 self.assertIsNone(re.match(r"^x{1,2}$", "xxx")) 607 self.assertIsNone(re.match(r"^x{1,2}?$", "xxx")) 608 609 self.assertTrue(re.match(r"^x{3}$", "xxx")) 610 self.assertTrue(re.match(r"^x{1,3}$", "xxx")) 611 self.assertTrue(re.match(r"^x{3,3}$", "xxx")) 612 self.assertTrue(re.match(r"^x{1,4}$", "xxx")) 613 self.assertTrue(re.match(r"^x{3,4}?$", "xxx")) 614 self.assertTrue(re.match(r"^x{3}?$", "xxx")) 615 self.assertTrue(re.match(r"^x{1,3}?$", "xxx")) 616 self.assertTrue(re.match(r"^x{1,4}?$", "xxx")) 617 self.assertTrue(re.match(r"^x{3,4}?$", "xxx")) 618 619 self.assertIsNone(re.match(r"^x{}$", "xxx")) 620 self.assertTrue(re.match(r"^x{}$", "x{}")) 621 622 self.checkPatternError(r'x{2,1}', 623 'min repeat greater than max repeat', 2) 624 625 def test_getattr(self): 626 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") 627 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) 628 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2) 629 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {}) 630 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex, 631 {'first': 1, 'other': 2}) 632 633 self.assertEqual(re.match("(a)", "a").pos, 0) 634 self.assertEqual(re.match("(a)", 
"a").endpos, 1) 635 self.assertEqual(re.match("(a)", "a").string, "a") 636 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) 637 self.assertTrue(re.match("(a)", "a").re) 638 639 # Issue 14260. groupindex should be non-modifiable mapping. 640 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)') 641 self.assertEqual(sorted(p.groupindex), ['first', 'other']) 642 self.assertEqual(p.groupindex['other'], 2) 643 with self.assertRaises(TypeError): 644 p.groupindex['other'] = 0 645 self.assertEqual(p.groupindex['other'], 2) 646 647 def test_special_escapes(self): 648 self.assertEqual(re.search(r"\b(b.)\b", 649 "abcd abc bcd bx").group(1), "bx") 650 self.assertEqual(re.search(r"\B(b.)\B", 651 "abc bcd bc abxd").group(1), "bx") 652 self.assertEqual(re.search(r"\b(b.)\b", 653 "abcd abc bcd bx", re.ASCII).group(1), "bx") 654 self.assertEqual(re.search(r"\B(b.)\B", 655 "abc bcd bc abxd", re.ASCII).group(1), "bx") 656 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") 657 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") 658 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M)) 659 self.assertEqual(re.search(br"\b(b.)\b", 660 b"abcd abc bcd bx").group(1), b"bx") 661 self.assertEqual(re.search(br"\B(b.)\B", 662 b"abc bcd bc abxd").group(1), b"bx") 663 self.assertEqual(re.search(br"\b(b.)\b", 664 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx") 665 self.assertEqual(re.search(br"\B(b.)\B", 666 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx") 667 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc") 668 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc") 669 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M)) 670 self.assertEqual(re.search(r"\d\D\w\W\s\S", 671 "1aa! a").group(0), "1aa! a") 672 self.assertEqual(re.search(br"\d\D\w\W\s\S", 673 b"1aa! a").group(0), b"1aa! a") 674 self.assertEqual(re.search(r"\d\D\w\W\s\S", 675 "1aa! a", re.ASCII).group(0), "1aa! 
a") 676 self.assertEqual(re.search(br"\d\D\w\W\s\S", 677 b"1aa! a", re.LOCALE).group(0), b"1aa! a") 678 679 def test_other_escapes(self): 680 self.checkPatternError("\\", 'bad escape (end of pattern)', 0) 681 self.assertEqual(re.match(r"\(", '(').group(), '(') 682 self.assertIsNone(re.match(r"\(", ')')) 683 self.assertEqual(re.match(r"\\", '\\').group(), '\\') 684 self.assertEqual(re.match(r"[\]]", ']').group(), ']') 685 self.assertIsNone(re.match(r"[\]]", '[')) 686 self.assertEqual(re.match(r"[a\-c]", '-').group(), '-') 687 self.assertIsNone(re.match(r"[a\-c]", 'b')) 688 self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') 689 self.assertIsNone(re.match(r"[\^a]+", 'b')) 690 re.purge() # for warnings 691 for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY': 692 with self.subTest(c): 693 self.assertRaises(re.error, re.compile, '\\%c' % c) 694 for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ': 695 with self.subTest(c): 696 self.assertRaises(re.error, re.compile, '[\\%c]' % c) 697 698 def test_named_unicode_escapes(self): 699 # test individual Unicode named escapes 700 self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<')) 701 self.assertTrue(re.match(r'\N{less-than sign}', '<')) 702 self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>')) 703 self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d')) 704 self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH ' 705 r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}', 706 '\ufbf9')) 707 self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]', 708 '=')) 709 self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]', 710 ';')) 711 712 # test errors in \N{name} handling - only valid names should pass 713 self.checkPatternError(r'\N', 'missing {', 2) 714 self.checkPatternError(r'[\N]', 'missing {', 3) 715 self.checkPatternError(r'\N{', 'missing character name', 3) 716 self.checkPatternError(r'[\N{', 'missing character name', 4) 717 self.checkPatternError(r'\N{}', 'missing character name', 3) 718 
self.checkPatternError(r'[\N{}]', 'missing character name', 4) 719 self.checkPatternError(r'\NSNAKE}', 'missing {', 2) 720 self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3) 721 self.checkPatternError(r'\N{SNAKE', 722 'missing }, unterminated name', 3) 723 self.checkPatternError(r'[\N{SNAKE]', 724 'missing }, unterminated name', 4) 725 self.checkPatternError(r'[\N{SNAKE]}', 726 "undefined character name 'SNAKE]'", 1) 727 self.checkPatternError(r'\N{SPAM}', 728 "undefined character name 'SPAM'", 0) 729 self.checkPatternError(r'[\N{SPAM}]', 730 "undefined character name 'SPAM'", 1) 731 self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0) 732 self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1) 733 734 def test_string_boundaries(self): 735 # See http://bugs.python.org/issue10713 736 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), 737 "abc") 738 # There's a word boundary at the start of a string. 739 self.assertTrue(re.match(r"\b", "abc")) 740 # A non-empty string includes a non-boundary zero-length match. 741 self.assertTrue(re.search(r"\B", "abc")) 742 # There is no non-boundary match at the start of a string. 743 self.assertFalse(re.match(r"\B", "abc")) 744 # However, an empty string contains no word boundaries, and also no 745 # non-boundaries. 746 self.assertIsNone(re.search(r"\B", "")) 747 # This one is questionable and different from the perlre behaviour, 748 # but describes current behavior. 749 self.assertIsNone(re.search(r"\b", "")) 750 # A single word-character string has two boundaries, but no 751 # non-boundary gaps. 752 self.assertEqual(len(re.findall(r"\b", "a")), 2) 753 self.assertEqual(len(re.findall(r"\B", "a")), 0) 754 # If there are no words, there are no boundaries 755 self.assertEqual(len(re.findall(r"\b", " ")), 0) 756 self.assertEqual(len(re.findall(r"\b", " ")), 0) 757 # Can match around the whitespace. 
758 self.assertEqual(len(re.findall(r"\B", " ")), 2) 759 760 def test_bigcharset(self): 761 self.assertEqual(re.match("([\u2222\u2223])", 762 "\u2222").group(1), "\u2222") 763 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255))) 764 self.assertEqual(re.match(r, "\uff01").group(), "\uff01") 765 766 def test_big_codesize(self): 767 # Issue #1160 768 r = re.compile('|'.join(('%d'%x for x in range(10000)))) 769 self.assertTrue(r.match('1000')) 770 self.assertTrue(r.match('9999')) 771 772 def test_anyall(self): 773 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), 774 "a\nb") 775 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), 776 "a\n\nb") 777 778 def test_lookahead(self): 779 self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a") 780 self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a") 781 self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a") 782 self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a") 783 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") 784 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") 785 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") 786 787 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") 788 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") 789 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") 790 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") 791 792 # Group reference. 793 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba')) 794 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac')) 795 # Conditional group reference. 
796 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 797 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc')) 798 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc')) 799 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc')) 800 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc')) 801 # Group used before defined. 802 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc')) 803 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc')) 804 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc')) 805 806 def test_lookbehind(self): 807 self.assertTrue(re.match(r'ab(?<=b)c', 'abc')) 808 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc')) 809 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc')) 810 self.assertTrue(re.match(r'ab(?<!c)c', 'abc')) 811 # Group reference. 812 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac')) 813 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa')) 814 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac')) 815 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa')) 816 # Conditional group reference. 817 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc')) 818 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc')) 819 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc')) 820 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc')) 821 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc')) 822 # Group used before defined. 
823 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)') 824 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc')) 825 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc')) 826 # Group defined in the same lookbehind pattern 827 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)') 828 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)') 829 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)') 830 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)') 831 832 def test_ignore_case(self): 833 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 834 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") 835 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") 836 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") 837 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") 838 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") 839 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") 840 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") 841 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") 842 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") 843 844 assert '\u212a'.lower() == 'k' # 'K' 845 self.assertTrue(re.match(r'K', '\u212a', re.I)) 846 self.assertTrue(re.match(r'k', '\u212a', re.I)) 847 self.assertTrue(re.match(r'\u212a', 'K', re.I)) 848 self.assertTrue(re.match(r'\u212a', 'k', re.I)) 849 assert '\u017f'.upper() == 'S' # 'ſ' 850 self.assertTrue(re.match(r'S', '\u017f', re.I)) 851 self.assertTrue(re.match(r's', '\u017f', re.I)) 852 self.assertTrue(re.match(r'\u017f', 'S', re.I)) 853 self.assertTrue(re.match(r'\u017f', 's', re.I)) 854 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 855 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I)) 856 self.assertTrue(re.match(r'\ufb06', 
'\ufb05', re.I)) 857 858 def test_ignore_case_set(self): 859 self.assertTrue(re.match(r'[19A]', 'A', re.I)) 860 self.assertTrue(re.match(r'[19a]', 'a', re.I)) 861 self.assertTrue(re.match(r'[19a]', 'A', re.I)) 862 self.assertTrue(re.match(r'[19A]', 'a', re.I)) 863 self.assertTrue(re.match(br'[19A]', b'A', re.I)) 864 self.assertTrue(re.match(br'[19a]', b'a', re.I)) 865 self.assertTrue(re.match(br'[19a]', b'A', re.I)) 866 self.assertTrue(re.match(br'[19A]', b'a', re.I)) 867 assert '\u212a'.lower() == 'k' # 'K' 868 self.assertTrue(re.match(r'[19K]', '\u212a', re.I)) 869 self.assertTrue(re.match(r'[19k]', '\u212a', re.I)) 870 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I)) 871 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I)) 872 assert '\u017f'.upper() == 'S' # 'ſ' 873 self.assertTrue(re.match(r'[19S]', '\u017f', re.I)) 874 self.assertTrue(re.match(r'[19s]', '\u017f', re.I)) 875 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I)) 876 self.assertTrue(re.match(r'[19\u017f]', 's', re.I)) 877 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 878 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I)) 879 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I)) 880 881 def test_ignore_case_range(self): 882 # Issues #3511, #17381. 
883 self.assertTrue(re.match(r'[9-a]', '_', re.I)) 884 self.assertIsNone(re.match(r'[9-A]', '_', re.I)) 885 self.assertTrue(re.match(br'[9-a]', b'_', re.I)) 886 self.assertIsNone(re.match(br'[9-A]', b'_', re.I)) 887 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I)) 888 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I)) 889 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I)) 890 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I)) 891 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I)) 892 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I)) 893 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I)) 894 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I)) 895 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I)) 896 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I)) 897 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I)) 898 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I)) 899 900 assert '\u212a'.lower() == 'k' # 'K' 901 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I)) 902 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I)) 903 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I)) 904 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I)) 905 assert '\u017f'.upper() == 'S' # 'ſ' 906 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I)) 907 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I)) 908 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I)) 909 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I)) 910 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 911 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I)) 912 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I)) 913 914 def test_category(self): 915 self.assertEqual(re.match(r"(\s)", " ").group(1), " ") 916 917 @cpython_only 918 def test_case_helpers(self): 919 import _sre 920 for i in range(128): 921 c = chr(i) 922 lo = 
ord(c.lower()) 923 self.assertEqual(_sre.ascii_tolower(i), lo) 924 self.assertEqual(_sre.unicode_tolower(i), lo) 925 iscased = c in string.ascii_letters 926 self.assertEqual(_sre.ascii_iscased(i), iscased) 927 self.assertEqual(_sre.unicode_iscased(i), iscased) 928 929 for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: 930 c = chr(i) 931 self.assertEqual(_sre.ascii_tolower(i), i) 932 if i != 0x0130: 933 self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) 934 iscased = c != c.lower() or c != c.upper() 935 self.assertFalse(_sre.ascii_iscased(i)) 936 self.assertEqual(_sre.unicode_iscased(i), 937 c != c.lower() or c != c.upper()) 938 939 self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) 940 self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) 941 self.assertFalse(_sre.ascii_iscased(0x0130)) 942 self.assertTrue(_sre.unicode_iscased(0x0130)) 943 944 def test_not_literal(self): 945 self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") 946 self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb") 947 948 def test_possible_set_operations(self): 949 s = bytes(range(128)).decode() 950 with self.assertWarns(FutureWarning): 951 p = re.compile(r'[0-9--1]') 952 self.assertEqual(p.findall(s), list('-./0123456789')) 953 self.assertEqual(re.findall(r'[--1]', s), list('-./01')) 954 with self.assertWarns(FutureWarning): 955 p = re.compile(r'[%--1]') 956 self.assertEqual(p.findall(s), list("%&'()*+,-1")) 957 with self.assertWarns(FutureWarning): 958 p = re.compile(r'[%--]') 959 self.assertEqual(p.findall(s), list("%&'()*+,-")) 960 961 with self.assertWarns(FutureWarning): 962 p = re.compile(r'[0-9&&1]') 963 self.assertEqual(p.findall(s), list('&0123456789')) 964 with self.assertWarns(FutureWarning): 965 p = re.compile(r'[\d&&1]') 966 self.assertEqual(p.findall(s), list('&0123456789')) 967 self.assertEqual(re.findall(r'[&&1]', s), list('&1')) 968 969 with self.assertWarns(FutureWarning): 970 p = re.compile(r'[0-9||a]') 971 self.assertEqual(p.findall(s), 
list('0123456789a|')) 972 with self.assertWarns(FutureWarning): 973 p = re.compile(r'[\d||a]') 974 self.assertEqual(p.findall(s), list('0123456789a|')) 975 self.assertEqual(re.findall(r'[||1]', s), list('1|')) 976 977 with self.assertWarns(FutureWarning): 978 p = re.compile(r'[0-9~~1]') 979 self.assertEqual(p.findall(s), list('0123456789~')) 980 with self.assertWarns(FutureWarning): 981 p = re.compile(r'[\d~~1]') 982 self.assertEqual(p.findall(s), list('0123456789~')) 983 self.assertEqual(re.findall(r'[~~1]', s), list('1~')) 984 985 with self.assertWarns(FutureWarning): 986 p = re.compile(r'[[0-9]|]') 987 self.assertEqual(p.findall(s), list('0123456789[]')) 988 989 with self.assertWarns(FutureWarning): 990 p = re.compile(r'[[:digit:]|]') 991 self.assertEqual(p.findall(s), list(':[]dgit')) 992 993 def test_search_coverage(self): 994 self.assertEqual(re.search(r"\s(b)", " b").group(1), "b") 995 self.assertEqual(re.search(r"a\s", "a ").group(0), "a ") 996 997 def assertMatch(self, pattern, text, match=None, span=None, 998 matcher=re.fullmatch): 999 if match is None and span is None: 1000 # the pattern matches the whole text 1001 match = text 1002 span = (0, len(text)) 1003 elif match is None or span is None: 1004 raise ValueError('If match is not None, span should be specified ' 1005 '(and vice versa).') 1006 m = matcher(pattern, text) 1007 self.assertTrue(m) 1008 self.assertEqual(m.group(), match) 1009 self.assertEqual(m.span(), span) 1010 1011 LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`' 1012 1013 def test_re_escape(self): 1014 p = ''.join(chr(i) for i in range(256)) 1015 for c in p: 1016 self.assertMatch(re.escape(c), c) 1017 self.assertMatch('[' + re.escape(c) + ']', c) 1018 self.assertMatch('(?x)' + re.escape(c), c) 1019 self.assertMatch(re.escape(p), p) 1020 for c in '-.]{}': 1021 self.assertEqual(re.escape(c)[:1], '\\') 1022 literal_chars = self.LITERAL_CHARS 1023 self.assertEqual(re.escape(literal_chars), literal_chars) 1024 1025 
def test_re_escape_bytes(self): 1026 p = bytes(range(256)) 1027 for i in p: 1028 b = bytes([i]) 1029 self.assertMatch(re.escape(b), b) 1030 self.assertMatch(b'[' + re.escape(b) + b']', b) 1031 self.assertMatch(b'(?x)' + re.escape(b), b) 1032 self.assertMatch(re.escape(p), p) 1033 for i in b'-.]{}': 1034 b = bytes([i]) 1035 self.assertEqual(re.escape(b)[:1], b'\\') 1036 literal_chars = self.LITERAL_CHARS.encode('ascii') 1037 self.assertEqual(re.escape(literal_chars), literal_chars) 1038 1039 def test_re_escape_non_ascii(self): 1040 s = 'xxx\u2620\u2620\u2620xxx' 1041 s_escaped = re.escape(s) 1042 self.assertEqual(s_escaped, s) 1043 self.assertMatch(s_escaped, s) 1044 self.assertMatch('.%s+.' % re.escape('\u2620'), s, 1045 'x\u2620\u2620\u2620x', (2, 7), re.search) 1046 1047 def test_re_escape_non_ascii_bytes(self): 1048 b = 'y\u2620y\u2620y'.encode('utf-8') 1049 b_escaped = re.escape(b) 1050 self.assertEqual(b_escaped, b) 1051 self.assertMatch(b_escaped, b) 1052 res = re.findall(re.escape('\u2620'.encode('utf-8')), b) 1053 self.assertEqual(len(res), 2) 1054 1055 def test_pickling(self): 1056 import pickle 1057 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE) 1058 for proto in range(pickle.HIGHEST_PROTOCOL + 1): 1059 pickled = pickle.dumps(oldpat, proto) 1060 newpat = pickle.loads(pickled) 1061 self.assertEqual(newpat, oldpat) 1062 # current pickle expects the _compile() reconstructor in re module 1063 from re import _compile 1064 1065 def test_copying(self): 1066 import copy 1067 p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?') 1068 self.assertIs(copy.copy(p), p) 1069 self.assertIs(copy.deepcopy(p), p) 1070 m = p.match('12.34') 1071 self.assertIs(copy.copy(m), m) 1072 self.assertIs(copy.deepcopy(m), m) 1073 1074 def test_constants(self): 1075 self.assertEqual(re.I, re.IGNORECASE) 1076 self.assertEqual(re.L, re.LOCALE) 1077 self.assertEqual(re.M, re.MULTILINE) 1078 self.assertEqual(re.S, re.DOTALL) 1079 self.assertEqual(re.X, re.VERBOSE) 1080 1081 def 
test_flags(self): 1082 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]: 1083 self.assertTrue(re.compile('^pattern$', flag)) 1084 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]: 1085 self.assertTrue(re.compile(b'^pattern$', flag)) 1086 1087 def test_sre_character_literals(self): 1088 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: 1089 if i < 256: 1090 self.assertTrue(re.match(r"\%03o" % i, chr(i))) 1091 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0")) 1092 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8")) 1093 self.assertTrue(re.match(r"\x%02x" % i, chr(i))) 1094 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0")) 1095 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z")) 1096 if i < 0x10000: 1097 self.assertTrue(re.match(r"\u%04x" % i, chr(i))) 1098 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0")) 1099 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z")) 1100 self.assertTrue(re.match(r"\U%08x" % i, chr(i))) 1101 self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0")) 1102 self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z")) 1103 self.assertTrue(re.match(r"\0", "\000")) 1104 self.assertTrue(re.match(r"\08", "\0008")) 1105 self.assertTrue(re.match(r"\01", "\001")) 1106 self.assertTrue(re.match(r"\018", "\0018")) 1107 self.checkPatternError(r"\567", 1108 r'octal escape value \567 outside of ' 1109 r'range 0-0o377', 0) 1110 self.checkPatternError(r"\911", 'invalid group reference 91', 1) 1111 self.checkPatternError(r"\x1", r'incomplete escape \x1', 0) 1112 self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0) 1113 self.checkPatternError(r"\u123", r'incomplete escape \u123', 0) 1114 self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0) 1115 self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0) 1116 self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0) 1117 self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0) 1118 1119 def 
test_sre_character_class_literals(self): 1120 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: 1121 if i < 256: 1122 self.assertTrue(re.match(r"[\%o]" % i, chr(i))) 1123 self.assertTrue(re.match(r"[\%o8]" % i, chr(i))) 1124 self.assertTrue(re.match(r"[\%03o]" % i, chr(i))) 1125 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i))) 1126 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i))) 1127 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i))) 1128 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i))) 1129 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i))) 1130 if i < 0x10000: 1131 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i))) 1132 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i))) 1133 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i))) 1134 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i))) 1135 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0")) 1136 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z")) 1137 self.checkPatternError(r"[\567]", 1138 r'octal escape value \567 outside of ' 1139 r'range 0-0o377', 1) 1140 self.checkPatternError(r"[\911]", r'bad escape \9', 1) 1141 self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1) 1142 self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1) 1143 self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1) 1144 self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1) 1145 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e")) 1146 1147 def test_sre_byte_literals(self): 1148 for i in [0, 8, 16, 32, 64, 127, 128, 255]: 1149 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i]))) 1150 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0")) 1151 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8")) 1152 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i]))) 1153 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0")) 1154 self.assertTrue(re.match((r"\x%02xz" % i).encode(), 
bytes([i])+b"z")) 1155 self.assertRaises(re.error, re.compile, br"\u1234") 1156 self.assertRaises(re.error, re.compile, br"\U00012345") 1157 self.assertTrue(re.match(br"\0", b"\000")) 1158 self.assertTrue(re.match(br"\08", b"\0008")) 1159 self.assertTrue(re.match(br"\01", b"\001")) 1160 self.assertTrue(re.match(br"\018", b"\0018")) 1161 self.checkPatternError(br"\567", 1162 r'octal escape value \567 outside of ' 1163 r'range 0-0o377', 0) 1164 self.checkPatternError(br"\911", 'invalid group reference 91', 1) 1165 self.checkPatternError(br"\x1", r'incomplete escape \x1', 0) 1166 self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0) 1167 1168 def test_sre_byte_class_literals(self): 1169 for i in [0, 8, 16, 32, 64, 127, 128, 255]: 1170 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i]))) 1171 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i]))) 1172 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i]))) 1173 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i]))) 1174 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i]))) 1175 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i]))) 1176 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i]))) 1177 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i]))) 1178 self.assertRaises(re.error, re.compile, br"[\u1234]") 1179 self.assertRaises(re.error, re.compile, br"[\U00012345]") 1180 self.checkPatternError(br"[\567]", 1181 r'octal escape value \567 outside of ' 1182 r'range 0-0o377', 1) 1183 self.checkPatternError(br"[\911]", r'bad escape \9', 1) 1184 self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1) 1185 1186 def test_character_set_errors(self): 1187 self.checkPatternError(r'[', 'unterminated character set', 0) 1188 self.checkPatternError(r'[^', 'unterminated character set', 0) 1189 self.checkPatternError(r'[a', 'unterminated character set', 0) 1190 # bug 545855 -- This pattern failed to cause a compile error as it 1191 # 
should, instead provoking a TypeError. 1192 self.checkPatternError(r"[a-", 'unterminated character set', 0) 1193 self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1) 1194 self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1) 1195 self.checkPatternError(r"[b-a]", 'bad character range b-a', 1) 1196 1197 def test_bug_113254(self): 1198 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) 1199 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) 1200 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) 1201 1202 def test_bug_527371(self): 1203 # bug described in patches 527371/672491 1204 self.assertIsNone(re.match(r'(a)?a','a').lastindex) 1205 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) 1206 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a') 1207 self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a') 1208 self.assertEqual(re.match(r"((a))", "a").lastindex, 1) 1209 1210 def test_bug_418626(self): 1211 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code 1212 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of 1213 # pattern '*?' on a long string. 1214 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) 1215 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), 1216 20003) 1217 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) 1218 # non-simple '*?' still used to hit the recursion limit, before the 1219 # non-recursive scheme was implemented. 1220 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) 1221 1222 def test_bug_612074(self): 1223 pat="["+re.escape("\u2039")+"]" 1224 self.assertEqual(re.compile(pat) and 1, 1) 1225 1226 def test_stack_overflow(self): 1227 # nasty cases that used to overflow the straightforward recursive 1228 # implementation of repeated groups. 
1229 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') 1230 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') 1231 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') 1232 1233 def test_nothing_to_repeat(self): 1234 for reps in '*', '+', '?', '{1,2}': 1235 for mod in '', '?': 1236 self.checkPatternError('%s%s' % (reps, mod), 1237 'nothing to repeat', 0) 1238 self.checkPatternError('(?:%s%s)' % (reps, mod), 1239 'nothing to repeat', 3) 1240 1241 def test_multiple_repeat(self): 1242 for outer_reps in '*', '+', '{1,2}': 1243 for outer_mod in '', '?': 1244 outer_op = outer_reps + outer_mod 1245 for inner_reps in '*', '+', '?', '{1,2}': 1246 for inner_mod in '', '?': 1247 inner_op = inner_reps + inner_mod 1248 self.checkPatternError(r'x%s%s' % (inner_op, outer_op), 1249 'multiple repeat', 1 + len(inner_op)) 1250 1251 def test_unlimited_zero_width_repeat(self): 1252 # Issue #9669 1253 self.assertIsNone(re.match(r'(?:a?)*y', 'z')) 1254 self.assertIsNone(re.match(r'(?:a?)+y', 'z')) 1255 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z')) 1256 self.assertIsNone(re.match(r'(?:a?)*?y', 'z')) 1257 self.assertIsNone(re.match(r'(?:a?)+?y', 'z')) 1258 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z')) 1259 1260 def test_scanner(self): 1261 def s_ident(scanner, token): return token 1262 def s_operator(scanner, token): return "op%s" % token 1263 def s_float(scanner, token): return float(token) 1264 def s_int(scanner, token): return int(token) 1265 1266 scanner = Scanner([ 1267 (r"[a-zA-Z_]\w*", s_ident), 1268 (r"\d+\.\d*", s_float), 1269 (r"\d+", s_int), 1270 (r"=|\+|-|\*|/", s_operator), 1271 (r"\s+", None), 1272 ]) 1273 1274 self.assertTrue(scanner.scanner.scanner("").pattern) 1275 1276 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), 1277 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 1278 'op+', 'bar'], '')) 1279 1280 def test_bug_448951(self): 1281 # bug 448951 (similar to 429357, but with single char match) 1282 # (Also test 
greedy matches.) 1283 for op in '','?','*': 1284 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), 1285 (None, None)) 1286 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), 1287 ('a:', 'a')) 1288 1289 def test_bug_725106(self): 1290 # capturing groups in alternatives in repeats 1291 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), 1292 ('b', 'a')) 1293 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), 1294 ('c', 'b')) 1295 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), 1296 ('b', None)) 1297 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), 1298 ('b', None)) 1299 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), 1300 ('b', 'a')) 1301 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), 1302 ('c', 'b')) 1303 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), 1304 ('b', None)) 1305 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), 1306 ('b', None)) 1307 1308 def test_bug_725149(self): 1309 # mark_stack_base restoring before restoring marks 1310 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), 1311 ('a', None)) 1312 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), 1313 ('a', None, None)) 1314 1315 def test_bug_764548(self): 1316 # bug 764548, re.compile() barfs on str/unicode subclasses 1317 class my_unicode(str): pass 1318 pat = re.compile(my_unicode("abc")) 1319 self.assertIsNone(pat.match("xyz")) 1320 1321 def test_finditer(self): 1322 iter = re.finditer(r":+", "a:b::c:::d") 1323 self.assertEqual([item.group(0) for item in iter], 1324 [":", "::", ":::"]) 1325 1326 pat = re.compile(r":+") 1327 iter = pat.finditer("a:b::c:::d", 1, 10) 1328 self.assertEqual([item.group(0) for item in iter], 1329 [":", "::", ":::"]) 1330 1331 pat = re.compile(r":+") 1332 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10) 1333 self.assertEqual([item.group(0) for item in iter], 1334 [":", "::", ":::"]) 1335 1336 pat = re.compile(r":+") 1337 iter = 
pat.finditer("a:b::c:::d", endpos=10, pos=1) 1338 self.assertEqual([item.group(0) for item in iter], 1339 [":", "::", ":::"]) 1340 1341 pat = re.compile(r":+") 1342 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8) 1343 self.assertEqual([item.group(0) for item in iter], 1344 ["::", "::"]) 1345 1346 def test_bug_926075(self): 1347 self.assertIsNot(re.compile('bug_926075'), 1348 re.compile(b'bug_926075')) 1349 1350 def test_bug_931848(self): 1351 pattern = "[\u002E\u3002\uFF0E\uFF61]" 1352 self.assertEqual(re.compile(pattern).split("a.b.c"), 1353 ['a','b','c']) 1354 1355 def test_bug_581080(self): 1356 iter = re.finditer(r"\s", "a b") 1357 self.assertEqual(next(iter).span(), (1,2)) 1358 self.assertRaises(StopIteration, next, iter) 1359 1360 scanner = re.compile(r"\s").scanner("a b") 1361 self.assertEqual(scanner.search().span(), (1, 2)) 1362 self.assertIsNone(scanner.search()) 1363 1364 def test_bug_817234(self): 1365 iter = re.finditer(r".*", "asdf") 1366 self.assertEqual(next(iter).span(), (0, 4)) 1367 self.assertEqual(next(iter).span(), (4, 4)) 1368 self.assertRaises(StopIteration, next, iter) 1369 1370 def test_bug_6561(self): 1371 # '\d' should match characters in Unicode category 'Nd' 1372 # (Number, Decimal Digit), but not those in 'Nl' (Number, 1373 # Letter) or 'No' (Number, Other). 
1374 decimal_digits = [ 1375 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd' 1376 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd' 1377 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' 1378 ] 1379 for x in decimal_digits: 1380 self.assertEqual(re.match(r'^\d$', x).group(0), x) 1381 1382 not_decimal_digits = [ 1383 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl' 1384 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' 1385 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No' 1386 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' 1387 ] 1388 for x in not_decimal_digits: 1389 self.assertIsNone(re.match(r'^\d$', x)) 1390 1391 def test_empty_array(self): 1392 # SF buf 1647541 1393 import array 1394 for typecode in 'bBuhHiIlLfd': 1395 a = array.array(typecode) 1396 self.assertIsNone(re.compile(b"bla").match(a)) 1397 self.assertEqual(re.compile(b"").match(a).groups(), ()) 1398 1399 def test_inline_flags(self): 1400 # Bug #1700 1401 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below 1402 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below 1403 1404 p = re.compile('.' + upper_char, re.I | re.S) 1405 q = p.match('\n' + lower_char) 1406 self.assertTrue(q) 1407 1408 p = re.compile('.' + lower_char, re.I | re.S) 1409 q = p.match('\n' + upper_char) 1410 self.assertTrue(q) 1411 1412 p = re.compile('(?i).' + upper_char, re.S) 1413 q = p.match('\n' + lower_char) 1414 self.assertTrue(q) 1415 1416 p = re.compile('(?i).' + lower_char, re.S) 1417 q = p.match('\n' + upper_char) 1418 self.assertTrue(q) 1419 1420 p = re.compile('(?is).' + upper_char) 1421 q = p.match('\n' + lower_char) 1422 self.assertTrue(q) 1423 1424 p = re.compile('(?is).' + lower_char) 1425 q = p.match('\n' + upper_char) 1426 self.assertTrue(q) 1427 1428 p = re.compile('(?s)(?i).' + upper_char) 1429 q = p.match('\n' + lower_char) 1430 self.assertTrue(q) 1431 1432 p = re.compile('(?s)(?i).' 
+ lower_char) 1433 q = p.match('\n' + upper_char) 1434 self.assertTrue(q) 1435 1436 self.assertTrue(re.match('(?ix) ' + upper_char, lower_char)) 1437 self.assertTrue(re.match('(?ix) ' + lower_char, upper_char)) 1438 self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X)) 1439 self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char)) 1440 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X)) 1441 1442 p = upper_char + '(?i)' 1443 with self.assertWarns(DeprecationWarning) as warns: 1444 self.assertTrue(re.match(p, lower_char)) 1445 self.assertEqual( 1446 str(warns.warnings[0].message), 1447 'Flags not at the start of the expression %r' % p 1448 ) 1449 self.assertEqual(warns.warnings[0].filename, __file__) 1450 1451 p = upper_char + '(?i)%s' % ('.?' * 100) 1452 with self.assertWarns(DeprecationWarning) as warns: 1453 self.assertTrue(re.match(p, lower_char)) 1454 self.assertEqual( 1455 str(warns.warnings[0].message), 1456 'Flags not at the start of the expression %r (truncated)' % p[:20] 1457 ) 1458 self.assertEqual(warns.warnings[0].filename, __file__) 1459 1460 # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning 1461 with warnings.catch_warnings(): 1462 warnings.simplefilter('error', BytesWarning) 1463 p = b'A(?i)' 1464 with self.assertWarns(DeprecationWarning) as warns: 1465 self.assertTrue(re.match(p, b'a')) 1466 self.assertEqual( 1467 str(warns.warnings[0].message), 1468 'Flags not at the start of the expression %r' % p 1469 ) 1470 self.assertEqual(warns.warnings[0].filename, __file__) 1471 1472 with self.assertWarns(DeprecationWarning): 1473 self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char)) 1474 with self.assertWarns(DeprecationWarning): 1475 self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char)) 1476 with self.assertWarns(DeprecationWarning): 1477 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char)) 1478 with self.assertWarns(DeprecationWarning): 1479 
self.assertTrue(re.match('^(?i)' + upper_char, lower_char)) 1480 with self.assertWarns(DeprecationWarning): 1481 self.assertTrue(re.match('$|(?i)' + upper_char, lower_char)) 1482 with self.assertWarns(DeprecationWarning) as warns: 1483 self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char)) 1484 self.assertRegex(str(warns.warnings[0].message), 1485 'Flags not at the start') 1486 self.assertEqual(warns.warnings[0].filename, __file__) 1487 with self.assertWarns(DeprecationWarning) as warns: 1488 self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')', 1489 lower_char)) 1490 self.assertRegex(str(warns.warnings[0].message), 1491 'Flags not at the start') 1492 self.assertEqual(warns.warnings[0].filename, __file__) 1493 with self.assertWarns(DeprecationWarning) as warns: 1494 self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')', 1495 lower_char)) 1496 self.assertRegex(str(warns.warnings[0].message), 1497 'Flags not at the start') 1498 self.assertEqual(warns.warnings[0].filename, __file__) 1499 1500 1501 def test_dollar_matches_twice(self): 1502 "$ matches the end of string, and just before the terminating \n" 1503 pattern = re.compile('$') 1504 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') 1505 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') 1506 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 1507 1508 pattern = re.compile('$', re.MULTILINE) 1509 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) 1510 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') 1511 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 1512 1513 def test_bytes_str_mixing(self): 1514 # Mixing str and bytes is disallowed 1515 pat = re.compile('.') 1516 bpat = re.compile(b'.') 1517 self.assertRaises(TypeError, pat.match, b'b') 1518 self.assertRaises(TypeError, bpat.match, 'b') 1519 self.assertRaises(TypeError, pat.sub, b'b', 'c') 1520 self.assertRaises(TypeError, pat.sub, 'b', b'c') 1521 self.assertRaises(TypeError, pat.sub, b'b', 
b'c') 1522 self.assertRaises(TypeError, bpat.sub, b'b', 'c') 1523 self.assertRaises(TypeError, bpat.sub, 'b', b'c') 1524 self.assertRaises(TypeError, bpat.sub, 'b', 'c') 1525 1526 def test_ascii_and_unicode_flag(self): 1527 # String patterns 1528 for flags in (0, re.UNICODE): 1529 pat = re.compile('\xc0', flags | re.IGNORECASE) 1530 self.assertTrue(pat.match('\xe0')) 1531 pat = re.compile(r'\w', flags) 1532 self.assertTrue(pat.match('\xe0')) 1533 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE) 1534 self.assertIsNone(pat.match('\xe0')) 1535 pat = re.compile('(?a)\xc0', re.IGNORECASE) 1536 self.assertIsNone(pat.match('\xe0')) 1537 pat = re.compile(r'\w', re.ASCII) 1538 self.assertIsNone(pat.match('\xe0')) 1539 pat = re.compile(r'(?a)\w') 1540 self.assertIsNone(pat.match('\xe0')) 1541 # Bytes patterns 1542 for flags in (0, re.ASCII): 1543 pat = re.compile(b'\xc0', flags | re.IGNORECASE) 1544 self.assertIsNone(pat.match(b'\xe0')) 1545 pat = re.compile(br'\w', flags) 1546 self.assertIsNone(pat.match(b'\xe0')) 1547 # Incompatibilities 1548 self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE) 1549 self.assertRaises(re.error, re.compile, br'(?u)\w') 1550 self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII) 1551 self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII) 1552 self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE) 1553 self.assertRaises(re.error, re.compile, r'(?au)\w') 1554 1555 def test_locale_flag(self): 1556 enc = locale.getpreferredencoding() 1557 # Search non-ASCII letter 1558 for i in range(128, 256): 1559 try: 1560 c = bytes([i]).decode(enc) 1561 sletter = c.lower() 1562 if sletter == c: continue 1563 bletter = sletter.encode(enc) 1564 if len(bletter) != 1: continue 1565 if bletter.decode(enc) != sletter: continue 1566 bpat = re.escape(bytes([i])) 1567 break 1568 except (UnicodeError, TypeError): 1569 pass 1570 else: 1571 bletter = None 1572 bpat = b'A' 1573 # Bytes patterns 1574 pat = re.compile(bpat, 
re.LOCALE | re.IGNORECASE) 1575 if bletter: 1576 self.assertTrue(pat.match(bletter)) 1577 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE) 1578 if bletter: 1579 self.assertTrue(pat.match(bletter)) 1580 pat = re.compile(bpat, re.IGNORECASE) 1581 if bletter: 1582 self.assertIsNone(pat.match(bletter)) 1583 pat = re.compile(br'\w', re.LOCALE) 1584 if bletter: 1585 self.assertTrue(pat.match(bletter)) 1586 pat = re.compile(br'(?L)\w') 1587 if bletter: 1588 self.assertTrue(pat.match(bletter)) 1589 pat = re.compile(br'\w') 1590 if bletter: 1591 self.assertIsNone(pat.match(bletter)) 1592 # Incompatibilities 1593 self.assertRaises(ValueError, re.compile, '', re.LOCALE) 1594 self.assertRaises(re.error, re.compile, '(?L)') 1595 self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII) 1596 self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII) 1597 self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE) 1598 self.assertRaises(re.error, re.compile, b'(?aL)') 1599 1600 def test_scoped_flags(self): 1601 self.assertTrue(re.match(r'(?i:a)b', 'Ab')) 1602 self.assertIsNone(re.match(r'(?i:a)b', 'aB')) 1603 self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE)) 1604 self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE)) 1605 self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab')) 1606 self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB')) 1607 1608 self.assertTrue(re.match(r'(?x: a) b', 'a b')) 1609 self.assertIsNone(re.match(r'(?x: a) b', ' a b')) 1610 self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE)) 1611 self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) 1612 1613 self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0')) 1614 self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0')) 1615 self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII)) 1616 1617 self.checkPatternError(r'(?a)(?-a:\w)', 1618 "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8) 1619 self.checkPatternError(r'(?i-i:a)', 1620 'bad inline flags: 
flag turned on and off', 5) 1621 self.checkPatternError(r'(?au:a)', 1622 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) 1623 self.checkPatternError(br'(?aL:a)', 1624 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) 1625 1626 self.checkPatternError(r'(?-', 'missing flag', 3) 1627 self.checkPatternError(r'(?-+', 'missing flag', 3) 1628 self.checkPatternError(r'(?-z', 'unknown flag', 3) 1629 self.checkPatternError(r'(?-i', 'missing :', 4) 1630 self.checkPatternError(r'(?-i)', 'missing :', 4) 1631 self.checkPatternError(r'(?-i+', 'missing :', 4) 1632 self.checkPatternError(r'(?-iz', 'unknown flag', 4) 1633 self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0) 1634 self.checkPatternError(r'(?i', 'missing -, : or )', 3) 1635 self.checkPatternError(r'(?i+', 'missing -, : or )', 3) 1636 self.checkPatternError(r'(?iz', 'unknown flag', 3) 1637 1638 def test_bug_6509(self): 1639 # Replacement strings of both types must parse properly. 1640 # all strings 1641 pat = re.compile(r'a(\w)') 1642 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc') 1643 pat = re.compile('a(.)') 1644 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234') 1645 pat = re.compile('..') 1646 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str') 1647 1648 # all bytes 1649 pat = re.compile(br'a(\w)') 1650 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc') 1651 pat = re.compile(b'a(.)') 1652 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD') 1653 pat = re.compile(b'..') 1654 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes') 1655 1656 def test_dealloc(self): 1657 # issue 3299: check for segfault in debug build 1658 import _sre 1659 # the overflow limit is different on wide and narrow builds and it 1660 # depends on the definition of SRE_CODE (see sre.h). 1661 # 2**128 should be big enough to overflow on both. For smaller values 1662 # a RuntimeError is raised instead of OverflowError. 
1663 long_overflow = 2**128 1664 self.assertRaises(TypeError, re.finditer, "a", {}) 1665 with self.assertRaises(OverflowError): 1666 _sre.compile("abc", 0, [long_overflow], 0, {}, ()) 1667 with self.assertRaises(TypeError): 1668 _sre.compile({}, 0, [], 0, [], []) 1669 1670 def test_search_dot_unicode(self): 1671 self.assertTrue(re.search("123.*-", '123abc-')) 1672 self.assertTrue(re.search("123.*-", '123\xe9-')) 1673 self.assertTrue(re.search("123.*-", '123\u20ac-')) 1674 self.assertTrue(re.search("123.*-", '123\U0010ffff-')) 1675 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-')) 1676 1677 def test_compile(self): 1678 # Test return value when given string and pattern as parameter 1679 pattern = re.compile('random pattern') 1680 self.assertIsInstance(pattern, re.Pattern) 1681 same_pattern = re.compile(pattern) 1682 self.assertIsInstance(same_pattern, re.Pattern) 1683 self.assertIs(same_pattern, pattern) 1684 # Test behaviour when not given a string or pattern as parameter 1685 self.assertRaises(TypeError, re.compile, 0) 1686 1687 @bigmemtest(size=_2G, memuse=1) 1688 def test_large_search(self, size): 1689 # Issue #10182: indices were 32-bit-truncated. 1690 s = 'a' * size 1691 m = re.search('$', s) 1692 self.assertIsNotNone(m) 1693 self.assertEqual(m.start(), size) 1694 self.assertEqual(m.end(), size) 1695 1696 # The huge memuse is because of re.sub() using a list and a join() 1697 # to create the replacement result. 1698 @bigmemtest(size=_2G, memuse=16 + 2) 1699 def test_large_subn(self, size): 1700 # Issue #10182: indices were 32-bit-truncated. 1701 s = 'a' * size 1702 r, n = re.subn('', '', s) 1703 self.assertEqual(r, s) 1704 self.assertEqual(n, size + 1) 1705 1706 def test_bug_16688(self): 1707 # Issue 16688: Backreferences make case-insensitive regex fail on 1708 # non-ASCII strings. 
1709 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a']) 1710 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2)) 1711 1712 def test_repeat_minmax_overflow(self): 1713 # Issue #13169 1714 string = "x" * 100000 1715 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535)) 1716 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535)) 1717 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535)) 1718 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536)) 1719 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536)) 1720 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536)) 1721 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t. 1722 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128) 1723 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128) 1724 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128) 1725 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) 1726 1727 @cpython_only 1728 def test_repeat_minmax_overflow_maxrepeat(self): 1729 try: 1730 from _sre import MAXREPEAT 1731 except ImportError: 1732 self.skipTest('requires _sre.MAXREPEAT constant') 1733 string = "x" * 100000 1734 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string)) 1735 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(), 1736 (0, 100000)) 1737 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string)) 1738 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT) 1739 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT) 1740 self.assertRaises(OverflowError, re.compile, r".{%d,}?" 
% MAXREPEAT) 1741 1742 def test_backref_group_name_in_exception(self): 1743 # Issue 17341: Poor error message when compiling invalid regex 1744 self.checkPatternError('(?P=<foo>)', 1745 "bad character in group name '<foo>'", 4) 1746 1747 def test_group_name_in_exception(self): 1748 # Issue 17341: Poor error message when compiling invalid regex 1749 self.checkPatternError('(?P<?foo>)', 1750 "bad character in group name '?foo'", 4) 1751 1752 def test_issue17998(self): 1753 for reps in '*', '+', '?', '{1}': 1754 for mod in '', '?': 1755 pattern = '.' + reps + mod + 'yz' 1756 self.assertEqual(re.compile(pattern, re.S).findall('xyz'), 1757 ['xyz'], msg=pattern) 1758 pattern = pattern.encode() 1759 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'), 1760 [b'xyz'], msg=pattern) 1761 1762 def test_match_repr(self): 1763 for string in '[abracadabra]', S('[abracadabra]'): 1764 m = re.search(r'(.+)(.*?)\1', string) 1765 pattern = r"<(%s\.)?%s object; span=\(1, 12\), match='abracadabra'>" % ( 1766 type(m).__module__, type(m).__qualname__ 1767 ) 1768 self.assertRegex(repr(m), pattern) 1769 for string in (b'[abracadabra]', B(b'[abracadabra]'), 1770 bytearray(b'[abracadabra]'), 1771 memoryview(b'[abracadabra]')): 1772 m = re.search(br'(.+)(.*?)\1', string) 1773 pattern = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % ( 1774 type(m).__module__, type(m).__qualname__ 1775 ) 1776 self.assertRegex(repr(m), pattern) 1777 1778 first, second = list(re.finditer("(aa)|(bb)", "aa bb")) 1779 pattern = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % ( 1780 type(second).__module__, type(second).__qualname__ 1781 ) 1782 self.assertRegex(repr(first), pattern) 1783 pattern = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % ( 1784 type(second).__module__, type(second).__qualname__ 1785 ) 1786 self.assertRegex(repr(second), pattern) 1787 1788 def test_zerowidth(self): 1789 # Issues 852532, 1647489, 3262, 25054. 
1790 self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', '']) 1791 self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', '']) 1792 self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc']) 1793 self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', '']) 1794 1795 self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-') 1796 self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-') 1797 self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]') 1798 1799 self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', '']) 1800 self.assertEqual(re.findall(r"\b|\w+", "a::bc"), 1801 ['', 'a', '', '', 'bc', '']) 1802 1803 self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")], 1804 [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)]) 1805 self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")], 1806 [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)]) 1807 1808 def test_bug_2537(self): 1809 # issue 2537: empty submatches 1810 for outer_op in ('{0,}', '*', '+', '{1,187}'): 1811 for inner_op in ('{0,}', '*', '?'): 1812 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op)) 1813 m = r.match("xyyzy") 1814 self.assertEqual(m.group(0), "xyy") 1815 self.assertEqual(m.group(1), "") 1816 self.assertEqual(m.group(2), "y") 1817 1818 @cpython_only 1819 def test_debug_flag(self): 1820 pat = r'(\.)(?:[ch]|py)(?(1)$|: )' 1821 with captured_stdout() as out: 1822 re.compile(pat, re.DEBUG) 1823 self.maxDiff = None 1824 dump = '''\ 1825SUBPATTERN 1 0 0 1826 LITERAL 46 1827BRANCH 1828 IN 1829 LITERAL 99 1830 LITERAL 104 1831OR 1832 LITERAL 112 1833 LITERAL 121 1834GROUPREF_EXISTS 1 1835 AT AT_END 1836ELSE 1837 LITERAL 58 1838 LITERAL 32 1839 1840 0. INFO 8 0b1 2 5 (to 9) 1841 prefix_skip 0 1842 prefix [0x2e] ('.') 1843 overlap [0] 1844 9: MARK 0 184511. LITERAL 0x2e ('.') 184613. MARK 1 184715. BRANCH 10 (to 26) 184817. IN 6 (to 24) 184919. LITERAL 0x63 ('c') 185021. 
LITERAL 0x68 ('h') 185123. FAILURE 185224: JUMP 9 (to 34) 185326: branch 7 (to 33) 185427. LITERAL 0x70 ('p') 185529. LITERAL 0x79 ('y') 185631. JUMP 2 (to 34) 185733: FAILURE 185834: GROUPREF_EXISTS 0 6 (to 41) 185937. AT END 186039. JUMP 5 (to 45) 186141: LITERAL 0x3a (':') 186243. LITERAL 0x20 (' ') 186345: SUCCESS 1864''' 1865 self.assertEqual(out.getvalue(), dump) 1866 # Debug output is output again even a second time (bypassing 1867 # the cache -- issue #20426). 1868 with captured_stdout() as out: 1869 re.compile(pat, re.DEBUG) 1870 self.assertEqual(out.getvalue(), dump) 1871 1872 def test_keyword_parameters(self): 1873 # Issue #20283: Accepting the string keyword parameter. 1874 pat = re.compile(r'(ab)') 1875 self.assertEqual( 1876 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9)) 1877 self.assertEqual( 1878 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9)) 1879 self.assertEqual( 1880 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9)) 1881 self.assertEqual( 1882 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab']) 1883 self.assertEqual( 1884 pat.split(string='abracadabra', maxsplit=1), 1885 ['', 'ab', 'racadabra']) 1886 self.assertEqual( 1887 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(), 1888 (7, 9)) 1889 1890 def test_bug_20998(self): 1891 # Issue #20998: Fullmatch of repeated single character pattern 1892 # with ignore case. 
1893 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3)) 1894 1895 def test_locale_caching(self): 1896 # Issue #22410 1897 oldlocale = locale.setlocale(locale.LC_CTYPE) 1898 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1899 for loc in 'en_US.iso88591', 'en_US.utf8': 1900 try: 1901 locale.setlocale(locale.LC_CTYPE, loc) 1902 except locale.Error: 1903 # Unsupported locale on this system 1904 self.skipTest('test needs %s locale' % loc) 1905 1906 re.purge() 1907 self.check_en_US_iso88591() 1908 self.check_en_US_utf8() 1909 re.purge() 1910 self.check_en_US_utf8() 1911 self.check_en_US_iso88591() 1912 1913 def check_en_US_iso88591(self): 1914 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') 1915 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1916 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1917 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1918 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1919 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5')) 1920 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5')) 1921 1922 def check_en_US_utf8(self): 1923 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') 1924 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1925 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1926 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1927 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1928 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) 1929 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) 1930 1931 def test_locale_compiled(self): 1932 oldlocale = locale.setlocale(locale.LC_CTYPE) 1933 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1934 for loc in 'en_US.iso88591', 'en_US.utf8': 1935 try: 1936 locale.setlocale(locale.LC_CTYPE, loc) 1937 except locale.Error: 1938 # Unsupported locale on this system 1939 self.skipTest('test needs %s locale' % loc) 1940 1941 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') 1942 p1 = 
re.compile(b'\xc5\xe5', re.L|re.I) 1943 p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I) 1944 p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I) 1945 p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I) 1946 for p in p1, p2, p3: 1947 self.assertTrue(p.match(b'\xc5\xe5')) 1948 self.assertTrue(p.match(b'\xe5\xe5')) 1949 self.assertTrue(p.match(b'\xc5\xc5')) 1950 self.assertIsNone(p4.match(b'\xe5\xc5')) 1951 self.assertIsNone(p4.match(b'\xe5\xe5')) 1952 self.assertIsNone(p4.match(b'\xc5\xc5')) 1953 1954 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') 1955 for p in p1, p2, p3: 1956 self.assertTrue(p.match(b'\xc5\xe5')) 1957 self.assertIsNone(p.match(b'\xe5\xe5')) 1958 self.assertIsNone(p.match(b'\xc5\xc5')) 1959 self.assertTrue(p4.match(b'\xe5\xc5')) 1960 self.assertIsNone(p4.match(b'\xe5\xe5')) 1961 self.assertIsNone(p4.match(b'\xc5\xc5')) 1962 1963 def test_error(self): 1964 with self.assertRaises(re.error) as cm: 1965 re.compile('(\u20ac))') 1966 err = cm.exception 1967 self.assertIsInstance(err.pattern, str) 1968 self.assertEqual(err.pattern, '(\u20ac))') 1969 self.assertEqual(err.pos, 3) 1970 self.assertEqual(err.lineno, 1) 1971 self.assertEqual(err.colno, 4) 1972 self.assertIn(err.msg, str(err)) 1973 self.assertIn(' at position 3', str(err)) 1974 self.assertNotIn(' at position 3', err.msg) 1975 # Bytes pattern 1976 with self.assertRaises(re.error) as cm: 1977 re.compile(b'(\xa4))') 1978 err = cm.exception 1979 self.assertIsInstance(err.pattern, bytes) 1980 self.assertEqual(err.pattern, b'(\xa4))') 1981 self.assertEqual(err.pos, 3) 1982 # Multiline pattern 1983 with self.assertRaises(re.error) as cm: 1984 re.compile(""" 1985 ( 1986 abc 1987 ) 1988 ) 1989 ( 1990 """, re.VERBOSE) 1991 err = cm.exception 1992 self.assertEqual(err.pos, 77) 1993 self.assertEqual(err.lineno, 5) 1994 self.assertEqual(err.colno, 17) 1995 self.assertIn(err.msg, str(err)) 1996 self.assertIn(' at position 77', str(err)) 1997 self.assertIn('(line 5, column 17)', str(err)) 1998 1999 def 
test_misc_errors(self): 2000 self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0) 2001 self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0) 2002 self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5) 2003 self.checkPatternError(r'(?P', 'unexpected end of pattern', 3) 2004 self.checkPatternError(r'(?z)', 'unknown extension ?z', 1) 2005 self.checkPatternError(r'(?iz)', 'unknown flag', 3) 2006 self.checkPatternError(r'(?i', 'missing -, : or )', 3) 2007 self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0) 2008 self.checkPatternError(r'(?<', 'unexpected end of pattern', 3) 2009 self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1) 2010 self.checkPatternError(r'(?', 'unexpected end of pattern', 2) 2011 2012 def test_enum(self): 2013 # Issue #28082: Check that str(flag) returns a human readable string 2014 # instead of an integer 2015 self.assertIn('ASCII', str(re.A)) 2016 self.assertIn('DOTALL', str(re.S)) 2017 2018 def test_pattern_compare(self): 2019 pattern1 = re.compile('abc', re.IGNORECASE) 2020 2021 # equal to itself 2022 self.assertEqual(pattern1, pattern1) 2023 self.assertFalse(pattern1 != pattern1) 2024 2025 # equal 2026 re.purge() 2027 pattern2 = re.compile('abc', re.IGNORECASE) 2028 self.assertEqual(hash(pattern2), hash(pattern1)) 2029 self.assertEqual(pattern2, pattern1) 2030 2031 # not equal: different pattern 2032 re.purge() 2033 pattern3 = re.compile('XYZ', re.IGNORECASE) 2034 # Don't test hash(pattern3) != hash(pattern1) because there is no 2035 # warranty that hash values are different 2036 self.assertNotEqual(pattern3, pattern1) 2037 2038 # not equal: different flag (flags=0) 2039 re.purge() 2040 pattern4 = re.compile('abc') 2041 self.assertNotEqual(pattern4, pattern1) 2042 2043 # only == and != comparison operators are supported 2044 with self.assertRaises(TypeError): 2045 pattern1 < pattern2 2046 2047 def test_pattern_compare_bytes(self): 2048 pattern1 = re.compile(b'abc') 2049 
2050 # equal: test bytes patterns 2051 re.purge() 2052 pattern2 = re.compile(b'abc') 2053 self.assertEqual(hash(pattern2), hash(pattern1)) 2054 self.assertEqual(pattern2, pattern1) 2055 2056 # not equal: pattern of a different types (str vs bytes), 2057 # comparison must not raise a BytesWarning 2058 re.purge() 2059 pattern3 = re.compile('abc') 2060 with warnings.catch_warnings(): 2061 warnings.simplefilter('error', BytesWarning) 2062 self.assertNotEqual(pattern3, pattern1) 2063 2064 def test_bug_29444(self): 2065 s = bytearray(b'abcdefgh') 2066 m = re.search(b'[a-h]+', s) 2067 m2 = re.search(b'[e-h]+', s) 2068 self.assertEqual(m.group(), b'abcdefgh') 2069 self.assertEqual(m2.group(), b'efgh') 2070 s[:] = b'xyz' 2071 self.assertEqual(m.group(), b'xyz') 2072 self.assertEqual(m2.group(), b'') 2073 2074 def test_bug_34294(self): 2075 # Issue 34294: wrong capturing groups 2076 2077 # exists since Python 2 2078 s = "a\tx" 2079 p = r"\b(?=(\t)|(x))x" 2080 self.assertEqual(re.search(p, s).groups(), (None, 'x')) 2081 2082 # introduced in Python 3.7.0 2083 s = "ab" 2084 p = r"(?=(.)(.)?)" 2085 self.assertEqual(re.findall(p, s), 2086 [('a', 'b'), ('b', '')]) 2087 self.assertEqual([m.groups() for m in re.finditer(p, s)], 2088 [('a', 'b'), ('b', None)]) 2089 2090 # test-cases provided by issue34294, introduced in Python 3.7.0 2091 p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)" 2092 s = "<test><foo2/></test>" 2093 self.assertEqual(re.findall(p, s), 2094 [('test', '<foo2/>'), ('foo2', '')]) 2095 self.assertEqual([m.groupdict() for m in re.finditer(p, s)], 2096 [{'tag': 'test', 'text': '<foo2/>'}, 2097 {'tag': 'foo2', 'text': None}]) 2098 s = "<test>Hello</test><foo/>" 2099 self.assertEqual([m.groupdict() for m in re.finditer(p, s)], 2100 [{'tag': 'test', 'text': 'Hello'}, 2101 {'tag': 'foo', 'text': None}]) 2102 s = "<test>Hello</test><foo/><foo/>" 2103 self.assertEqual([m.groupdict() for m in re.finditer(p, s)], 2104 [{'tag': 'test', 'text': 'Hello'}, 2105 {'tag': 
class PatternReprTests(unittest.TestCase):
    # Tests for repr() of compiled patterns and of re flag values.

    def check(self, pattern, expected):
        # Compare repr() of *pattern* compiled without explicit flags.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # Compare repr() of *pattern* compiled with *flags*.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.UNICODE does not appear in the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        # A flag set inside the pattern shows up in the repr too.
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a name are rendered in hexadecimal.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        expected = "re.compile('random pattern', re.IGNORECASE|0x123000)"
        self.check_flags('random pattern', 0x123000|re.I, expected)

    def test_bytes(self):
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        expected = "re.compile(b'bytes pattern', re.ASCII)"
        self.check_flags(b'bytes pattern', re.A, expected)

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        # Each row is (pattern, expected repr).
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern is truncated, but still starts
        # with the pattern text and ends with the flags.
        pattern = 'Very %spattern' % ('long ' * 1000)
        prefix = "re.compile('Very long long lon"

        text = repr(re.compile(pattern))
        self.assertLess(len(text), 300)
        self.assertEqual(text[:30], prefix)

        text = repr(re.compile(pattern, re.I))
        self.assertLess(len(text), 300)
        self.assertEqual(text[:30], prefix)
        self.assertEqual(text[-16:], ", re.IGNORECASE)")

    def test_flags_repr(self):
        # Each row is (flag value, expected repr).
        named = re.I|re.S|re.X
        with_unknown = named|(1<<20)
        cases = (
            (re.I, "re.IGNORECASE"),
            (named, "re.IGNORECASE|re.DOTALL|re.VERBOSE"),
            (with_unknown, "re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000"),
            (~re.I, "~re.IGNORECASE"),
            (~named, "~(re.IGNORECASE|re.DOTALL|re.VERBOSE)"),
            (~with_unknown,
             "~(re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000)"),
        )
        for flag, expected in cases:
            self.assertEqual(repr(flag), expected)
2190 """ 2191 2192 @cpython_only 2193 def test_immutable(self): 2194 # bpo-43908: check that re types are immutable 2195 with self.assertRaises(TypeError): 2196 re.Match.foo = 1 2197 with self.assertRaises(TypeError): 2198 re.Pattern.foo = 1 2199 with self.assertRaises(TypeError): 2200 pat = re.compile("") 2201 tp = type(pat.scanner("")) 2202 tp.foo = 1 2203 2204 def test_overlap_table(self): 2205 f = sre_compile._generate_overlap_table 2206 self.assertEqual(f(""), []) 2207 self.assertEqual(f("a"), [0]) 2208 self.assertEqual(f("abcd"), [0, 0, 0, 0]) 2209 self.assertEqual(f("aaaa"), [0, 1, 2, 3]) 2210 self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1]) 2211 self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) 2212 2213 def test_signedness(self): 2214 self.assertGreaterEqual(sre_compile.MAXREPEAT, 0) 2215 self.assertGreaterEqual(sre_compile.MAXGROUPS, 0) 2216 2217 @cpython_only 2218 def test_disallow_instantiation(self): 2219 # Ensure that the type disallows instantiation (bpo-43916) 2220 check_disallow_instantiation(self, re.Match) 2221 check_disallow_instantiation(self, re.Pattern) 2222 pat = re.compile("") 2223 check_disallow_instantiation(self, type(pat.scanner(""))) 2224 2225 2226class ExternalTests(unittest.TestCase): 2227 2228 def test_re_benchmarks(self): 2229 're_tests benchmarks' 2230 from test.re_tests import benchmarks 2231 for pattern, s in benchmarks: 2232 with self.subTest(pattern=pattern, string=s): 2233 p = re.compile(pattern) 2234 self.assertTrue(p.search(s)) 2235 self.assertTrue(p.match(s)) 2236 self.assertTrue(p.fullmatch(s)) 2237 s2 = ' '*10000 + s + ' '*10000 2238 self.assertTrue(p.search(s2)) 2239 self.assertTrue(p.match(s2, 10000)) 2240 self.assertTrue(p.match(s2, 10000, 10000 + len(s))) 2241 self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s))) 2242 2243 def test_re_tests(self): 2244 're_tests test suite' 2245 from test.re_tests import tests, FAIL, SYNTAX_ERROR 2246 for t in tests: 2247 pattern = s = outcome = repl = expected = None 2248 
if len(t) == 5: 2249 pattern, s, outcome, repl, expected = t 2250 elif len(t) == 3: 2251 pattern, s, outcome = t 2252 else: 2253 raise ValueError('Test tuples should have 3 or 5 fields', t) 2254 2255 with self.subTest(pattern=pattern, string=s): 2256 if outcome == SYNTAX_ERROR: # Expected a syntax error 2257 with self.assertRaises(re.error): 2258 re.compile(pattern) 2259 continue 2260 2261 obj = re.compile(pattern) 2262 result = obj.search(s) 2263 if outcome == FAIL: 2264 self.assertIsNone(result, 'Succeeded incorrectly') 2265 continue 2266 2267 with self.subTest(): 2268 self.assertTrue(result, 'Failed incorrectly') 2269 # Matched, as expected, so now we compute the 2270 # result string and compare it to our expected result. 2271 start, end = result.span(0) 2272 vardict = {'found': result.group(0), 2273 'groups': result.group(), 2274 'flags': result.re.flags} 2275 for i in range(1, 100): 2276 try: 2277 gi = result.group(i) 2278 # Special hack because else the string concat fails: 2279 if gi is None: 2280 gi = "None" 2281 except IndexError: 2282 gi = "Error" 2283 vardict['g%d' % i] = gi 2284 for i in result.re.groupindex.keys(): 2285 try: 2286 gi = result.group(i) 2287 if gi is None: 2288 gi = "None" 2289 except IndexError: 2290 gi = "Error" 2291 vardict[i] = gi 2292 self.assertEqual(eval(repl, vardict), expected, 2293 'grouping error') 2294 2295 # Try the match with both pattern and string converted to 2296 # bytes, and check that it still succeeds. 2297 try: 2298 bpat = bytes(pattern, "ascii") 2299 bs = bytes(s, "ascii") 2300 except UnicodeEncodeError: 2301 # skip non-ascii tests 2302 pass 2303 else: 2304 with self.subTest('bytes pattern match'): 2305 obj = re.compile(bpat) 2306 self.assertTrue(obj.search(bs)) 2307 2308 # Try the match with LOCALE enabled, and check that it 2309 # still succeeds. 
2310 with self.subTest('locale-sensitive match'): 2311 obj = re.compile(bpat, re.LOCALE) 2312 result = obj.search(bs) 2313 if result is None: 2314 print('=== Fails on locale-sensitive match', t) 2315 2316 # Try the match with the search area limited to the extent 2317 # of the match and see if it still succeeds. \B will 2318 # break (because it won't match at the end or start of a 2319 # string), so we'll ignore patterns that feature it. 2320 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B' 2321 and result is not None): 2322 with self.subTest('range-limited match'): 2323 obj = re.compile(pattern) 2324 self.assertTrue(obj.search(s, start, end + 1)) 2325 2326 # Try the match with IGNORECASE enabled, and check that it 2327 # still succeeds. 2328 with self.subTest('case-insensitive match'): 2329 obj = re.compile(pattern, re.IGNORECASE) 2330 self.assertTrue(obj.search(s)) 2331 2332 # Try the match with UNICODE locale enabled, and check 2333 # that it still succeeds. 2334 with self.subTest('unicode-sensitive match'): 2335 obj = re.compile(pattern, re.UNICODE) 2336 self.assertTrue(obj.search(s)) 2337 2338 2339if __name__ == "__main__": 2340 unittest.main() 2341