from test.support import (gc_collect, bigmemtest, _2G,
                          cpython_only, captured_stdout)
import locale
import re
import sre_compile
import string
import unittest
import warnings
from re import Scanner
from weakref import proxy

# Misc tests from Tim Peters' re.doc

# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.

class S(str):
    """str subclass whose item/slice access returns an S instance."""

    def __getitem__(self, index):
        return S(super().__getitem__(index))

class B(bytes):
    """bytes subclass whose item/slice access is wrapped in B."""

    def __getitem__(self, index):
        return B(super().__getitem__(index))

class ReTests(unittest.TestCase):
    """Tests for the re module: matching, substitution, splitting, errors."""

    def assertTypedEqual(self, actual, expect, msg=None):
        """Assert actual == expect and that the value types match too.

        Tuples and lists in *expect* are walked in parallel with *actual*;
        every other value must have exactly the same type as its
        counterpart (an S or B result does not pass for str or bytes).
        """
        self.assertEqual(actual, expect, msg)
        def recurse(actual, expect):
            if isinstance(expect, (tuple, list)):
                for x, y in zip(actual, expect):
                    recurse(x, y)
            else:
                self.assertIs(type(actual), type(expect), msg)
        recurse(actual, expect)

    def checkPatternError(self, pattern, errmsg, pos=None):
        """Assert that compiling *pattern* raises re.error with *errmsg*.

        When *pos* is given, the error's reported position must equal it.
        """
        with self.assertRaises(re.error) as cm:
            re.compile(pattern)
        with self.subTest(pattern=pattern):
            err = cm.exception
            self.assertEqual(err.msg, errmsg)
            if pos is not None:
                self.assertEqual(err.pos, pos)

    def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
        """Assert that re.sub(pattern, repl, string) raises re.error.

        The error message must equal *errmsg*; when *pos* is given, the
        error's reported position must equal it.
        """
        with self.assertRaises(re.error) as cm:
            re.sub(pattern, repl, string)
        with self.subTest(pattern=pattern, repl=repl):
            err = cm.exception
            self.assertEqual(err.msg, errmsg)
            if pos is not None:
                self.assertEqual(err.pos, pos)

    def test_keep_buffer(self):
        # See bug 14212
        b = bytearray(b'x')
        it = re.finditer(b'a', b)
        # The live iterator holds the buffer, so resizing must fail.
        with self.assertRaises(BufferError):
            b.extend(b'x'*400)
        list(it)
        del it
        gc_collect()
        # Once the iterator is gone the bytearray can be resized again.
        b.extend(b'x'*400)

    def test_weakref(self):
        s = 'QabbbcR'
        x = re.compile('ab+c')
        y = proxy(x)
        self.assertEqual(x.findall(s), y.findall(s))

    def test_search_star_plus(self):
        self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
        self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
        self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
        self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
        self.assertIsNone(re.search('x', 'aaa'))
        self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
        self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
        self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
        self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
        self.assertIsNone(re.match('a+', 'xxx'))

    def bump_num(self, matchobj):
        """Replacement callback: return the matched integer plus one, as str."""
        int_value = int(matchobj.group(0))
        return str(int_value + 1)

    def test_basic_re_sub(self):
        self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
        self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
        self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
        self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
        self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
        self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
        for y in ("\xe0", "\u0430", "\U0001d49c"):
            self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

        self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                         '9.3 -3 24x100y')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                         '9.3 -3 23x99y')
        self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
                         '9.3 -3 23x99y')

        # A callable's return value is inserted literally; a template
        # string has its escapes processed.
        self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
        self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

        s = r"\1\1"
        self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
        self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
        self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

        self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                         (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
        for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
            with self.subTest(c):
                with self.assertRaises(re.error):
                    self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)

        self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')

    def test_bug_449964(self):
        # fails for group followed by other escape
        self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
                         'xx\bxx\b')

    def test_bug_449000(self):
        # Test for sub() on escaped characters
        self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
                         'abc\ndef\n')
        self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
                         'abc\ndef\n')
        self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
                         'abc\ndef\n')
        self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
                         'abc\ndef\n')

    def test_bug_1661(self):
        # Verify that flags do not get silently ignored with compiled patterns
        pattern = re.compile('.')
        self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
        self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
        self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
        self.assertRaises(ValueError, re.compile, pattern, re.I)

    def test_bug_3629(self):
        # A regex that triggered a bug in the sre-code validator
        re.compile("(?P<quote>)(?(quote))")

    def test_sub_template_numeric_escape(self):
        # bug 776311 and friends
        self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
        self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
        self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
        self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
        self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
        self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
        self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
        self.assertEqual(re.sub('x', r'\377', 'x'), '\377')

        self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
        self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

        self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
        self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
        self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
        self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
        self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

        self.checkTemplateError('x', r'\400', 'x',
                                r'octal escape value \400 outside of '
                                r'range 0-0o377', 0)
        self.checkTemplateError('x', r'\777', 'x',
                                r'octal escape value \777 outside of '
                                r'range 0-0o377', 0)

        self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
        self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
        self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
        self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
        self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
        self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
        self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
        self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
        self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
        self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
        self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
        self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
        self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)

        # in python2.3 (etc), these loop endlessly in sre_parser.py
        self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                         'xz8')
        self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                         'xza')

    def test_qualified_re_sub(self):
        self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
        self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
        self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')

    def test_bug_114660(self):
        self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
                         'hello there')

    def test_symbolic_groups(self):
        re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
        re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
        re.compile(r'(?P<a1>x)\1(?(1)y)')
        self.checkPatternError(r'(?P<a>)(?P<a>)',
                               "redefinition of group name 'a' as group 2; "
                               "was group 1")
        self.checkPatternError(r'(?P<a>(?P=a))',
                               "cannot refer to an open group", 10)
        self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
        self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
        self.checkPatternError(r'(?P=', 'missing group name', 4)
        self.checkPatternError(r'(?P=)', 'missing group name', 4)
        self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
        self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
        self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
        self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
        self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
        self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
        self.checkPatternError(r'(?P<', 'missing group name', 4)
        self.checkPatternError(r'(?P<>)', 'missing group name', 4)
        self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
        self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
        self.checkPatternError(r'(?(', 'missing group name', 3)
        self.checkPatternError(r'(?())', 'missing group name', 3)
        self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
        self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
        self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
        self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
        # New valid/invalid identifiers in Python 3
        re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
        # The group name must be a non-empty identifier; an empty name is
        # rejected with 'missing group name' (checked above).
        re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
        self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
        # Support > 100 groups.
        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
        pat = '(?:%s)(?(200)z|t)' % pat
        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

    def test_symbolic_refs(self):
        self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
                                'missing >, unterminated name', 3)
        self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
                                'missing group name', 3)
        self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
        self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
                                "bad character in group name 'a a'", 3)
        self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
                                'missing group name', 3)
        self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
                                "bad character in group name '1a1'", 3)
        self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
                                'invalid group reference 2', 3)
        self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
                                'invalid group reference 2', 1)
        with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
            re.sub('(?P<a>x)', r'\g<ab>', 'xx')
        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
        self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                                "bad character in group name '-1'", 3)
        # New valid/invalid identifiers in Python 3
        self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
        # An empty name in \g<> is rejected with 'missing group name'
        # (checked above), so a real non-ASCII identifier is used here.
        self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
        self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                                "bad character in group name '©'", 3)
        # Support > 100 groups.
        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
        self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')

    def test_re_subn(self):
        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
        self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
        self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
        self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
        self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
        self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))

    def test_re_split(self):
        for string in ":a:b::c", S(":a:b::c"):
            self.assertTypedEqual(re.split(":", string),
                                  ['', 'a', 'b', '', 'c'])
            self.assertTypedEqual(re.split(":+", string),
                                  ['', 'a', 'b', 'c'])
            self.assertTypedEqual(re.split("(:+)", string),
                                  ['', ':', 'a', ':', 'b', '::', 'c'])
        for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                       memoryview(b":a:b::c")):
            self.assertTypedEqual(re.split(b":", string),
                                  [b'', b'a', b'b', b'', b'c'])
            self.assertTypedEqual(re.split(b":+", string),
                                  [b'', b'a', b'b', b'c'])
            self.assertTypedEqual(re.split(b"(:+)", string),
                                  [b'', b':', b'a', b':', b'b', b'::', b'c'])
        for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                        "\U0001d49c\U0001d49e\U0001d4b5"):
            string = ":%s:%s::%s" % (a, b, c)
            self.assertEqual(re.split(":", string), ['', a, b, '', c])
            self.assertEqual(re.split(":+", string), ['', a, b, c])
            self.assertEqual(re.split("(:+)", string),
                             ['', ':', a, ':', b, '::', c])

        self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
        self.assertEqual(re.split("(:)+", ":a:b::c"),
                         ['', ':', 'a', ':', 'b', ':', 'c'])
        self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                         ['', ':', 'a', ':b::', 'c'])
        self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                         ['', None, ':', 'a', None, ':', '', 'b', None, '',
                          None, '::', 'c'])
        self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                         ['', 'a', '', '', 'c'])

        # Separators that can match the empty string.
        for sep, expected in [
            (':*', ['', '', 'a', '', 'b', '', 'c', '']),
            ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
            ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']),
            ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']),
        ]:
            with self.subTest(sep=sep):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

        # Separators that only ever match the empty string.
        for sep, expected in [
            ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
            (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
            (r'(?=:)', ['', ':a', ':b', ':', ':c']),
            (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
        ]:
            with self.subTest(sep=sep):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    def test_qualified_re_split(self):
        self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
        self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
        self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
        self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
                         ['', ':', 'a', ':', 'b::c'])
        self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
                         ['', ':', 'a', ':', 'b::c'])
        self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
                         ['', ':', '', '', 'a:b::c'])

    def test_re_findall(self):
        self.assertEqual(re.findall(":+", "abc"), [])
        for string in "a:b::c:::d", S("a:b::c:::d"):
            self.assertTypedEqual(re.findall(":+", string),
                                  [":", "::", ":::"])
            self.assertTypedEqual(re.findall("(:+)", string),
                                  [":", "::", ":::"])
            self.assertTypedEqual(re.findall("(:)(:*)", string),
                                  [(":", ""), (":", ":"), (":", "::")])
        for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                       memoryview(b"a:b::c:::d")):
            self.assertTypedEqual(re.findall(b":+", string),
                                  [b":", b"::", b":::"])
            self.assertTypedEqual(re.findall(b"(:+)", string),
                                  [b":", b"::", b":::"])
            self.assertTypedEqual(re.findall(b"(:)(:*)", string),
                                  [(b":", b""), (b":", b":"), (b":", b"::")])
        for x in ("\xe0", "\u0430", "\U0001d49c"):
            xx = x * 2
            xxx = x * 3
            string = "a%sb%sc%sd" % (x, xx, xxx)
            self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
            self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
            self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
                             [(x, ""), (x, x), (x, xx)])

    def test_bug_117612(self):
        self.assertEqual(re.findall(r"(a|(b))", "aba"),
                         [("a", ""),("b", "b"),("a", "")])

    def test_re_match(self):
        for string in 'a', S('a'):
            self.assertEqual(re.match('a', string).groups(), ())
            self.assertEqual(re.match('(a)', string).groups(), ('a',))
            self.assertEqual(re.match('(a)', string).group(0), 'a')
            self.assertEqual(re.match('(a)', string).group(1), 'a')
            self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
        for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
            self.assertEqual(re.match(b'a', string).groups(), ())
            self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
            self.assertEqual(re.match(b'(a)', string).group(0), b'a')
            self.assertEqual(re.match(b'(a)', string).group(1), b'a')
            self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
        for a in ("\xe0", "\u0430", "\U0001d49c"):
            self.assertEqual(re.match(a, a).groups(), ())
            self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
            self.assertEqual(re.match('(%s)' % a, a).group(0), a)
            self.assertEqual(re.match('(%s)' % a, a).group(1), a)
            self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

        pat = re.compile('((a)|(b))(c)?')
        self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
        self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
        self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
        self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
        self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
        self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
        self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                         (None, 'b', None))
        self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))

    def test_group(self):
        class Index:
            def __init__(self, value):
                self.value = value
            def __index__(self):
                return self.value
        # A single group
        m = re.match('(a)(b)', 'ab')
        self.assertEqual(m.group(), 'ab')
        self.assertEqual(m.group(0), 'ab')
        self.assertEqual(m.group(1), 'a')
        self.assertEqual(m.group(Index(1)), 'a')
        self.assertRaises(IndexError, m.group, -1)
        self.assertRaises(IndexError, m.group, 3)
        self.assertRaises(IndexError, m.group, 1<<1000)
        self.assertRaises(IndexError, m.group, Index(1<<1000))
        self.assertRaises(IndexError, m.group, 'x')
        # Multiple groups
        self.assertEqual(m.group(2, 1), ('b', 'a'))
        self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))

    def test_match_getitem(self):
        pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')

        m = pat.match('a')
        self.assertEqual(m['a1'], 'a')
        self.assertEqual(m['b2'], None)
        self.assertEqual(m['c3'], None)
        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
        self.assertEqual(m[0], 'a')
        self.assertEqual(m[1], 'a')
        self.assertEqual(m[2], None)
        self.assertEqual(m[3], None)
        with self.assertRaisesRegex(IndexError, 'no such group'):
            m['X']
        with self.assertRaisesRegex(IndexError, 'no such group'):
            m[-1]
        with self.assertRaisesRegex(IndexError, 'no such group'):
            m[4]
        with self.assertRaisesRegex(IndexError, 'no such group'):
            m[0, 1]
        with self.assertRaisesRegex(IndexError, 'no such group'):
            m[(0,)]
        with self.assertRaisesRegex(IndexError, 'no such group'):
            m[(0, 1)]
        with self.assertRaisesRegex(IndexError, 'no such group'):
            'a1={a2}'.format_map(m)

        m = pat.match('ac')
        self.assertEqual(m['a1'], 'a')
        self.assertEqual(m['b2'], None)
        self.assertEqual(m['c3'], 'c')
        self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
        self.assertEqual(m[0], 'ac')
        self.assertEqual(m[1], 'a')
        self.assertEqual(m[2], None)
        self.assertEqual(m[3], 'c')

        # Cannot assign.
        with self.assertRaises(TypeError):
            m[0] = 1

        # No len().
        self.assertRaises(TypeError, len, m)

    def test_re_fullmatch(self):
        # Issue 16203: Proposal: add re.fullmatch() method.
        self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
        for string in "ab", S("ab"):
            self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
        for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
            self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
        for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
            r = r"%s|%s" % (a, a + b)
            self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
        self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
        self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
        self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
        self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
        self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
        self.assertIsNone(re.fullmatch(r"a+", "ab"))
        self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
        self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
        self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
        self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
        self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
        self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))

        self.assertEqual(
            re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
        self.assertEqual(
            re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
        self.assertEqual(
            re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))

    def test_re_groupref_exists(self):
        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
                         ('(', 'a'))
        self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
                         (None, 'a'))
        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
        self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
        self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
                         ('a', 'b'))
        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
                         (None, 'd'))
        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
                         (None, 'd'))
        self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
                         ('a', ''))

        # Tests for bug #1177831: exercise groups other than the first group
        p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
        self.assertEqual(p.match('abc').groups(),
                         ('a', 'b', 'c'))
        self.assertEqual(p.match('ad').groups(),
                         ('a', None, 'd'))
        self.assertIsNone(p.match('abd'))
        self.assertIsNone(p.match('ac'))

        # Support > 100 groups.
        pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
        pat = '(?:%s)(?(200)z)' % pat
        self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

        self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
        self.checkPatternError(r'()(?(1)a|b',
                               'missing ), unterminated subpattern', 2)
        self.checkPatternError(r'()(?(1)a|b|c)',
                               'conditional backref with more than '
                               'two branches', 10)

    def test_re_groupref_overflow(self):
        from sre_constants import MAXGROUPS
        self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx',
                                'invalid group reference %d' % MAXGROUPS, 3)
        self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS,
                               'invalid group reference %d' % MAXGROUPS, 10)

    def test_re_groupref(self):
        self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
                         ('|', 'a'))
        self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
                         (None, 'a'))
        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
        self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
                         ('a', 'a'))
        self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
                         (None, None))

        self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)

    def test_groupdict(self):
        self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
                                  'first second').groupdict(),
                         {'first':'first', 'second':'second'})

    def test_expand(self):
        self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
                                  "first second")
                                  .expand(r"\2 \1 \g<second> \g<first>"),
                         "second first second first")
        self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
                                  "first")
                                  .expand(r"\2 \g<second>"),
                         " ")

    def test_repeat_minmax(self):
        self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
        self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
        self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
        self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))

        self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
        self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")

        self.assertIsNone(re.match(r"^x{1}$", "xxx"))
        self.assertIsNone(re.match(r"^x{1}?$", "xxx"))
        self.assertIsNone(re.match(r"^x{1,2}$", "xxx"))
        self.assertIsNone(re.match(r"^x{1,2}?$", "xxx"))

        self.assertTrue(re.match(r"^x{3}$", "xxx"))
        self.assertTrue(re.match(r"^x{1,3}$", "xxx"))
        self.assertTrue(re.match(r"^x{3,3}$", "xxx"))
        self.assertTrue(re.match(r"^x{1,4}$", "xxx"))
        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
        self.assertTrue(re.match(r"^x{3}?$", "xxx"))
        self.assertTrue(re.match(r"^x{1,3}?$", "xxx"))
        self.assertTrue(re.match(r"^x{1,4}?$", "xxx"))
        self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))

        # An empty {} is not a repeat count; it matches itself literally.
        self.assertIsNone(re.match(r"^x{}$", "xxx"))
        self.assertTrue(re.match(r"^x{}$", "x{}"))

        self.checkPatternError(r'x{2,1}',
                               'min repeat greater than max repeat', 2)

    def test_getattr(self):
        self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
        self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
        self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
        self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
        self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
                         {'first': 1, 'other': 2})

        self.assertEqual(re.match("(a)", "a").pos, 0)
        self.assertEqual(re.match("(a)", "a").endpos, 1)
        self.assertEqual(re.match("(a)", "a").string, "a")
        self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
        self.assertTrue(re.match("(a)", "a").re)

        # Issue 14260. groupindex should be non-modifiable mapping.
        p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
        self.assertEqual(sorted(p.groupindex), ['first', 'other'])
        self.assertEqual(p.groupindex['other'], 2)
        with self.assertRaises(TypeError):
            p.groupindex['other'] = 0
        self.assertEqual(p.groupindex['other'], 2)

    def test_special_escapes(self):
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx").group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd").group(1), "bx")
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx", re.ASCII).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd", re.ASCII).group(1), "bx")
        self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
        self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
        self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
        self.assertEqual(re.search(br"\b(b.)\b",
                                   b"abcd abc bcd bx").group(1), b"bx")
        self.assertEqual(re.search(br"\B(b.)\B",
                                   b"abc bcd bc abxd").group(1), b"bx")
        self.assertEqual(re.search(br"\b(b.)\b",
                                   b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
        self.assertEqual(re.search(br"\B(b.)\B",
                                   b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
        self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
        self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
        self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a").group(0), "1aa! a")
        self.assertEqual(re.search(br"\d\D\w\W\s\S",
                                   b"1aa! a").group(0), b"1aa! a")
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a", re.ASCII).group(0), "1aa! a")
        self.assertEqual(re.search(br"\d\D\w\W\s\S",
                                   b"1aa! a", re.LOCALE).group(0), b"1aa! a")

    def test_other_escapes(self):
        self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
        self.assertEqual(re.match(r"\(", '(').group(), '(')
        self.assertIsNone(re.match(r"\(", ')'))
        self.assertEqual(re.match(r"\\", '\\').group(), '\\')
        self.assertEqual(re.match(r"[\]]", ']').group(), ']')
        self.assertIsNone(re.match(r"[\]]", '['))
        self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
        self.assertIsNone(re.match(r"[a\-c]", 'b'))
        self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
        self.assertIsNone(re.match(r"[\^a]+", 'b'))
        re.purge()  # for warnings
        for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
            with self.subTest(c):
                self.assertRaises(re.error, re.compile, '\\%c' % c)
        for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
            with self.subTest(c):
                self.assertRaises(re.error, re.compile, '[\\%c]' % c)

    def test_named_unicode_escapes(self):
        # test individual Unicode named escapes
        self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))
        self.assertTrue(re.match(r'\N{less-than sign}', '<'))
        self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>'))
        self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d'))
        self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH '
                                 r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}',
                                 '\ufbf9'))
        self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
                                 '='))
        self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
                                   ';'))

        # test errors in \N{name} handling - only valid names should pass
        self.checkPatternError(r'\N', 'missing {', 2)
        self.checkPatternError(r'[\N]', 'missing {', 3)
        self.checkPatternError(r'\N{', 'missing character name', 3)
        self.checkPatternError(r'[\N{', 'missing character name', 4)
        self.checkPatternError(r'\N{}', 'missing character name', 3)
        self.checkPatternError(r'[\N{}]', 'missing character name', 4)
        self.checkPatternError(r'\NSNAKE}', 'missing {', 2)
        self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3)
        self.checkPatternError(r'\N{SNAKE',
                               'missing }, unterminated name', 3)
        self.checkPatternError(r'[\N{SNAKE]',
                               'missing }, unterminated name', 4)
        self.checkPatternError(r'[\N{SNAKE]}',
                               "undefined character name 'SNAKE]'", 1)
        self.checkPatternError(r'\N{SPAM}',
                               "undefined character name 'SPAM'", 0)
        self.checkPatternError(r'[\N{SPAM}]',
                               "undefined character name 'SPAM'", 1)
        self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0)
        self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1)

    def test_string_boundaries(self):
        # See http://bugs.python.org/issue10713
        self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
                         "abc")
        # There's a word boundary at the start of a string.
        self.assertTrue(re.match(r"\b", "abc"))
        # A non-empty string includes a non-boundary zero-length match.
        self.assertTrue(re.search(r"\B", "abc"))
        # There is no non-boundary match at the start of a string.
        self.assertFalse(re.match(r"\B", "abc"))
        # However, an empty string contains no word boundaries, and also no
        # non-boundaries.
        self.assertIsNone(re.search(r"\B", ""))
        # This one is questionable and different from the perlre behaviour,
        # but describes current behavior.
        self.assertIsNone(re.search(r"\b", ""))
        # A single word-character string has two boundaries, but no
        # non-boundary gaps.
        self.assertEqual(len(re.findall(r"\b", "a")), 2)
        self.assertEqual(len(re.findall(r"\B", "a")), 0)
        # If there are no words, there are no boundaries
        self.assertEqual(len(re.findall(r"\b", " ")), 0)
        self.assertEqual(len(re.findall(r"\b", " ")), 0)
        # Can match around the whitespace.
        self.assertEqual(len(re.findall(r"\B", " ")), 2)

    def test_bigcharset(self):
        self.assertEqual(re.match("([\u2222\u2223])",
                                  "\u2222").group(1), "\u2222")
        r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
        self.assertEqual(re.match(r, "\uff01").group(), "\uff01")

    def test_big_codesize(self):
        # Issue #1160
        r = re.compile('|'.join(('%d'%x for x in range(10000))))
        self.assertTrue(r.match('1000'))
        self.assertTrue(r.match('9999'))

    def test_anyall(self):
        self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
                         "a\nb")
        self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
                         "a\n\nb")

    def test_lookahead(self):
        self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
        self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
        self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
        self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
        self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
        self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
        self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")

        self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
        self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
        self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
        self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")

        # Group reference.
        self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
        self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
        # Conditional group reference.
        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
        self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
        self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
        # Group used before defined.
        self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
        self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
        self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))

    def test_lookbehind(self):
        self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
        self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
        self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
        self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
        # Group reference.
        self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
        self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
        self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
        self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
        # Conditional group reference.
        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
        self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
        self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
        # Group used before defined.
822 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)') 823 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc')) 824 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc')) 825 # Group defined in the same lookbehind pattern 826 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)') 827 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)') 828 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)') 829 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)') 830 831 def test_ignore_case(self): 832 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 833 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") 834 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") 835 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") 836 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") 837 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") 838 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") 839 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") 840 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") 841 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") 842 843 assert '\u212a'.lower() == 'k' # 'K' 844 self.assertTrue(re.match(r'K', '\u212a', re.I)) 845 self.assertTrue(re.match(r'k', '\u212a', re.I)) 846 self.assertTrue(re.match(r'\u212a', 'K', re.I)) 847 self.assertTrue(re.match(r'\u212a', 'k', re.I)) 848 assert '\u017f'.upper() == 'S' # 'ſ' 849 self.assertTrue(re.match(r'S', '\u017f', re.I)) 850 self.assertTrue(re.match(r's', '\u017f', re.I)) 851 self.assertTrue(re.match(r'\u017f', 'S', re.I)) 852 self.assertTrue(re.match(r'\u017f', 's', re.I)) 853 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 854 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I)) 855 self.assertTrue(re.match(r'\ufb06', 
'\ufb05', re.I)) 856 857 def test_ignore_case_set(self): 858 self.assertTrue(re.match(r'[19A]', 'A', re.I)) 859 self.assertTrue(re.match(r'[19a]', 'a', re.I)) 860 self.assertTrue(re.match(r'[19a]', 'A', re.I)) 861 self.assertTrue(re.match(r'[19A]', 'a', re.I)) 862 self.assertTrue(re.match(br'[19A]', b'A', re.I)) 863 self.assertTrue(re.match(br'[19a]', b'a', re.I)) 864 self.assertTrue(re.match(br'[19a]', b'A', re.I)) 865 self.assertTrue(re.match(br'[19A]', b'a', re.I)) 866 assert '\u212a'.lower() == 'k' # 'K' 867 self.assertTrue(re.match(r'[19K]', '\u212a', re.I)) 868 self.assertTrue(re.match(r'[19k]', '\u212a', re.I)) 869 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I)) 870 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I)) 871 assert '\u017f'.upper() == 'S' # 'ſ' 872 self.assertTrue(re.match(r'[19S]', '\u017f', re.I)) 873 self.assertTrue(re.match(r'[19s]', '\u017f', re.I)) 874 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I)) 875 self.assertTrue(re.match(r'[19\u017f]', 's', re.I)) 876 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 877 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I)) 878 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I)) 879 880 def test_ignore_case_range(self): 881 # Issues #3511, #17381. 
882 self.assertTrue(re.match(r'[9-a]', '_', re.I)) 883 self.assertIsNone(re.match(r'[9-A]', '_', re.I)) 884 self.assertTrue(re.match(br'[9-a]', b'_', re.I)) 885 self.assertIsNone(re.match(br'[9-A]', b'_', re.I)) 886 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I)) 887 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I)) 888 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I)) 889 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I)) 890 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I)) 891 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I)) 892 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I)) 893 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I)) 894 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I)) 895 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I)) 896 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I)) 897 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I)) 898 899 assert '\u212a'.lower() == 'k' # 'K' 900 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I)) 901 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I)) 902 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I)) 903 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I)) 904 assert '\u017f'.upper() == 'S' # 'ſ' 905 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I)) 906 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I)) 907 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I)) 908 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I)) 909 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st' 910 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I)) 911 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I)) 912 913 def test_category(self): 914 self.assertEqual(re.match(r"(\s)", " ").group(1), " ") 915 916 @cpython_only 917 def test_case_helpers(self): 918 import _sre 919 for i in range(128): 920 c = chr(i) 921 lo = 
ord(c.lower()) 922 self.assertEqual(_sre.ascii_tolower(i), lo) 923 self.assertEqual(_sre.unicode_tolower(i), lo) 924 iscased = c in string.ascii_letters 925 self.assertEqual(_sre.ascii_iscased(i), iscased) 926 self.assertEqual(_sre.unicode_iscased(i), iscased) 927 928 for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: 929 c = chr(i) 930 self.assertEqual(_sre.ascii_tolower(i), i) 931 if i != 0x0130: 932 self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) 933 iscased = c != c.lower() or c != c.upper() 934 self.assertFalse(_sre.ascii_iscased(i)) 935 self.assertEqual(_sre.unicode_iscased(i), 936 c != c.lower() or c != c.upper()) 937 938 self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) 939 self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) 940 self.assertFalse(_sre.ascii_iscased(0x0130)) 941 self.assertTrue(_sre.unicode_iscased(0x0130)) 942 943 def test_not_literal(self): 944 self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") 945 self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb") 946 947 def test_possible_set_operations(self): 948 s = bytes(range(128)).decode() 949 with self.assertWarns(FutureWarning): 950 p = re.compile(r'[0-9--1]') 951 self.assertEqual(p.findall(s), list('-./0123456789')) 952 self.assertEqual(re.findall(r'[--1]', s), list('-./01')) 953 with self.assertWarns(FutureWarning): 954 p = re.compile(r'[%--1]') 955 self.assertEqual(p.findall(s), list("%&'()*+,-1")) 956 with self.assertWarns(FutureWarning): 957 p = re.compile(r'[%--]') 958 self.assertEqual(p.findall(s), list("%&'()*+,-")) 959 960 with self.assertWarns(FutureWarning): 961 p = re.compile(r'[0-9&&1]') 962 self.assertEqual(p.findall(s), list('&0123456789')) 963 with self.assertWarns(FutureWarning): 964 p = re.compile(r'[\d&&1]') 965 self.assertEqual(p.findall(s), list('&0123456789')) 966 self.assertEqual(re.findall(r'[&&1]', s), list('&1')) 967 968 with self.assertWarns(FutureWarning): 969 p = re.compile(r'[0-9||a]') 970 self.assertEqual(p.findall(s), 
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.fullmatch):
    """Assert that *matcher* finds *pattern* in *text* with the given result.

    With neither *match* nor *span* given, the pattern is expected to match
    the whole of *text*.  Otherwise both must be given: *match* is the
    expected matched text and *span* the expected ``(start, end)`` pair.

    Raises ValueError when exactly one of *match* and *span* is supplied.
    """
    omitted = (match is None, span is None)
    if all(omitted):
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    elif any(omitted):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
test_re_escape_bytes(self): 1025 p = bytes(range(256)) 1026 for i in p: 1027 b = bytes([i]) 1028 self.assertMatch(re.escape(b), b) 1029 self.assertMatch(b'[' + re.escape(b) + b']', b) 1030 self.assertMatch(b'(?x)' + re.escape(b), b) 1031 self.assertMatch(re.escape(p), p) 1032 for i in b'-.]{}': 1033 b = bytes([i]) 1034 self.assertEqual(re.escape(b)[:1], b'\\') 1035 literal_chars = self.LITERAL_CHARS.encode('ascii') 1036 self.assertEqual(re.escape(literal_chars), literal_chars) 1037 1038 def test_re_escape_non_ascii(self): 1039 s = 'xxx\u2620\u2620\u2620xxx' 1040 s_escaped = re.escape(s) 1041 self.assertEqual(s_escaped, s) 1042 self.assertMatch(s_escaped, s) 1043 self.assertMatch('.%s+.' % re.escape('\u2620'), s, 1044 'x\u2620\u2620\u2620x', (2, 7), re.search) 1045 1046 def test_re_escape_non_ascii_bytes(self): 1047 b = 'y\u2620y\u2620y'.encode('utf-8') 1048 b_escaped = re.escape(b) 1049 self.assertEqual(b_escaped, b) 1050 self.assertMatch(b_escaped, b) 1051 res = re.findall(re.escape('\u2620'.encode('utf-8')), b) 1052 self.assertEqual(len(res), 2) 1053 1054 def test_pickling(self): 1055 import pickle 1056 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE) 1057 for proto in range(pickle.HIGHEST_PROTOCOL + 1): 1058 pickled = pickle.dumps(oldpat, proto) 1059 newpat = pickle.loads(pickled) 1060 self.assertEqual(newpat, oldpat) 1061 # current pickle expects the _compile() reconstructor in re module 1062 from re import _compile 1063 1064 def test_copying(self): 1065 import copy 1066 p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?') 1067 self.assertIs(copy.copy(p), p) 1068 self.assertIs(copy.deepcopy(p), p) 1069 m = p.match('12.34') 1070 self.assertIs(copy.copy(m), m) 1071 self.assertIs(copy.deepcopy(m), m) 1072 1073 def test_constants(self): 1074 self.assertEqual(re.I, re.IGNORECASE) 1075 self.assertEqual(re.L, re.LOCALE) 1076 self.assertEqual(re.M, re.MULTILINE) 1077 self.assertEqual(re.S, re.DOTALL) 1078 self.assertEqual(re.X, re.VERBOSE) 1079 1080 def 
def test_sre_character_literals(self):
    # Octal, \x, \u and \U escapes outside a character set, alone and
    # directly followed by another literal character, then malformed escapes.
    #
    # Each entry: (exclusive upper bound on the code point, or None for no
    # bound, ((pattern template, literal suffix of the subject), ...)).
    escape_forms = (
        (256, ((r"\%03o", ""), (r"\%03o0", "0"), (r"\%03o8", "8"),
               (r"\x%02x", ""), (r"\x%02x0", "0"), (r"\x%02xz", "z"))),
        (0x10000, ((r"\u%04x", ""), (r"\u%04x0", "0"), (r"\u%04xz", "z"))),
        (None, ((r"\U%08x", ""), (r"\U%08x0", "0"), (r"\U%08xz", "z"))),
    )
    for code in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        char = chr(code)
        for bound, forms in escape_forms:
            if bound is not None and code >= bound:
                continue
            for template, suffix in forms:
                self.assertTrue(re.match(template % code, char + suffix))

    short_octal = (
        (r"\0", "\000"),
        (r"\08", "\0008"),
        (r"\01", "\001"),
        (r"\018", "\0018"),
    )
    for pattern, subject in short_octal:
        self.assertTrue(re.match(pattern, subject))

    # (pattern, expected error message, expected error position)
    malformed = (
        (r"\567",
         r'octal escape value \567 outside of '
         r'range 0-0o377', 0),
        (r"\911", 'invalid group reference 91', 1),
        (r"\x1", r'incomplete escape \x1', 0),
        (r"\x1z", r'incomplete escape \x1', 0),
        (r"\u123", r'incomplete escape \u123', 0),
        (r"\u123z", r'incomplete escape \u123', 0),
        (r"\U0001234", r'incomplete escape \U0001234', 0),
        (r"\U0001234z", r'incomplete escape \U0001234', 0),
        (r"\U00110000", r'bad escape \U00110000', 0),
    )
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)
def test_sre_byte_literals(self):
    # Octal and \x escapes in bytes patterns, alone and directly followed by
    # another literal byte; \u and \U are rejected in bytes patterns.
    #
    # (pattern template, literal suffix of the subject)
    escape_forms = (
        (r"\%03o", b""), (r"\%03o0", b"0"), (r"\%03o8", b"8"),
        (r"\x%02x", b""), (r"\x%02x0", b"0"), (r"\x%02xz", b"z"),
    )
    for value in [0, 8, 16, 32, 64, 127, 128, 255]:
        byte = bytes([value])
        for template, suffix in escape_forms:
            pattern = (template % value).encode()
            self.assertTrue(re.match(pattern, byte + suffix))

    for unsupported in (br"\u1234", br"\U00012345"):
        with self.assertRaises(re.error):
            re.compile(unsupported)

    short_octal = (
        (br"\0", b"\000"),
        (br"\08", b"\0008"),
        (br"\01", b"\001"),
        (br"\018", b"\0018"),
    )
    for pattern, subject in short_octal:
        self.assertTrue(re.match(pattern, subject))

    # (pattern, expected error message, expected error position)
    malformed = (
        (br"\567",
         r'octal escape value \567 outside of '
         r'range 0-0o377', 0),
        (br"\911", 'invalid group reference 91', 1),
        (br"\x1", r'incomplete escape \x1', 0),
        (br"\x1z", r'incomplete escape \x1', 0),
    )
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)
should, instead provoking a TypeError. 1191 self.checkPatternError(r"[a-", 'unterminated character set', 0) 1192 self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1) 1193 self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1) 1194 self.checkPatternError(r"[b-a]", 'bad character range b-a', 1) 1195 1196 def test_bug_113254(self): 1197 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) 1198 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) 1199 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) 1200 1201 def test_bug_527371(self): 1202 # bug described in patches 527371/672491 1203 self.assertIsNone(re.match(r'(a)?a','a').lastindex) 1204 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) 1205 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a') 1206 self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a') 1207 self.assertEqual(re.match(r"((a))", "a").lastindex, 1) 1208 1209 def test_bug_418626(self): 1210 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code 1211 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of 1212 # pattern '*?' on a long string. 1213 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) 1214 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), 1215 20003) 1216 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) 1217 # non-simple '*?' still used to hit the recursion limit, before the 1218 # non-recursive scheme was implemented. 1219 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) 1220 1221 def test_bug_612074(self): 1222 pat="["+re.escape("\u2039")+"]" 1223 self.assertEqual(re.compile(pat) and 1, 1) 1224 1225 def test_stack_overflow(self): 1226 # nasty cases that used to overflow the straightforward recursive 1227 # implementation of repeated groups. 
1228 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') 1229 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') 1230 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') 1231 1232 def test_nothing_to_repeat(self): 1233 for reps in '*', '+', '?', '{1,2}': 1234 for mod in '', '?': 1235 self.checkPatternError('%s%s' % (reps, mod), 1236 'nothing to repeat', 0) 1237 self.checkPatternError('(?:%s%s)' % (reps, mod), 1238 'nothing to repeat', 3) 1239 1240 def test_multiple_repeat(self): 1241 for outer_reps in '*', '+', '{1,2}': 1242 for outer_mod in '', '?': 1243 outer_op = outer_reps + outer_mod 1244 for inner_reps in '*', '+', '?', '{1,2}': 1245 for inner_mod in '', '?': 1246 inner_op = inner_reps + inner_mod 1247 self.checkPatternError(r'x%s%s' % (inner_op, outer_op), 1248 'multiple repeat', 1 + len(inner_op)) 1249 1250 def test_unlimited_zero_width_repeat(self): 1251 # Issue #9669 1252 self.assertIsNone(re.match(r'(?:a?)*y', 'z')) 1253 self.assertIsNone(re.match(r'(?:a?)+y', 'z')) 1254 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z')) 1255 self.assertIsNone(re.match(r'(?:a?)*?y', 'z')) 1256 self.assertIsNone(re.match(r'(?:a?)+?y', 'z')) 1257 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z')) 1258 1259 def test_scanner(self): 1260 def s_ident(scanner, token): return token 1261 def s_operator(scanner, token): return "op%s" % token 1262 def s_float(scanner, token): return float(token) 1263 def s_int(scanner, token): return int(token) 1264 1265 scanner = Scanner([ 1266 (r"[a-zA-Z_]\w*", s_ident), 1267 (r"\d+\.\d*", s_float), 1268 (r"\d+", s_int), 1269 (r"=|\+|-|\*|/", s_operator), 1270 (r"\s+", None), 1271 ]) 1272 1273 self.assertTrue(scanner.scanner.scanner("").pattern) 1274 1275 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), 1276 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 1277 'op+', 'bar'], '')) 1278 1279 def test_bug_448951(self): 1280 # bug 448951 (similar to 429357, but with single char match) 1281 # (Also test 
greedy matches.) 1282 for op in '','?','*': 1283 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), 1284 (None, None)) 1285 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), 1286 ('a:', 'a')) 1287 1288 def test_bug_725106(self): 1289 # capturing groups in alternatives in repeats 1290 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), 1291 ('b', 'a')) 1292 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), 1293 ('c', 'b')) 1294 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), 1295 ('b', None)) 1296 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), 1297 ('b', None)) 1298 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), 1299 ('b', 'a')) 1300 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), 1301 ('c', 'b')) 1302 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), 1303 ('b', None)) 1304 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), 1305 ('b', None)) 1306 1307 def test_bug_725149(self): 1308 # mark_stack_base restoring before restoring marks 1309 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), 1310 ('a', None)) 1311 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), 1312 ('a', None, None)) 1313 1314 def test_bug_764548(self): 1315 # bug 764548, re.compile() barfs on str/unicode subclasses 1316 class my_unicode(str): pass 1317 pat = re.compile(my_unicode("abc")) 1318 self.assertIsNone(pat.match("xyz")) 1319 1320 def test_finditer(self): 1321 iter = re.finditer(r":+", "a:b::c:::d") 1322 self.assertEqual([item.group(0) for item in iter], 1323 [":", "::", ":::"]) 1324 1325 pat = re.compile(r":+") 1326 iter = pat.finditer("a:b::c:::d", 1, 10) 1327 self.assertEqual([item.group(0) for item in iter], 1328 [":", "::", ":::"]) 1329 1330 pat = re.compile(r":+") 1331 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10) 1332 self.assertEqual([item.group(0) for item in iter], 1333 [":", "::", ":::"]) 1334 1335 pat = re.compile(r":+") 1336 iter = 
pat.finditer("a:b::c:::d", endpos=10, pos=1) 1337 self.assertEqual([item.group(0) for item in iter], 1338 [":", "::", ":::"]) 1339 1340 pat = re.compile(r":+") 1341 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8) 1342 self.assertEqual([item.group(0) for item in iter], 1343 ["::", "::"]) 1344 1345 def test_bug_926075(self): 1346 self.assertIsNot(re.compile('bug_926075'), 1347 re.compile(b'bug_926075')) 1348 1349 def test_bug_931848(self): 1350 pattern = "[\u002E\u3002\uFF0E\uFF61]" 1351 self.assertEqual(re.compile(pattern).split("a.b.c"), 1352 ['a','b','c']) 1353 1354 def test_bug_581080(self): 1355 iter = re.finditer(r"\s", "a b") 1356 self.assertEqual(next(iter).span(), (1,2)) 1357 self.assertRaises(StopIteration, next, iter) 1358 1359 scanner = re.compile(r"\s").scanner("a b") 1360 self.assertEqual(scanner.search().span(), (1, 2)) 1361 self.assertIsNone(scanner.search()) 1362 1363 def test_bug_817234(self): 1364 iter = re.finditer(r".*", "asdf") 1365 self.assertEqual(next(iter).span(), (0, 4)) 1366 self.assertEqual(next(iter).span(), (4, 4)) 1367 self.assertRaises(StopIteration, next, iter) 1368 1369 def test_bug_6561(self): 1370 # '\d' should match characters in Unicode category 'Nd' 1371 # (Number, Decimal Digit), but not those in 'Nl' (Number, 1372 # Letter) or 'No' (Number, Other). 
1373 decimal_digits = [ 1374 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd' 1375 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd' 1376 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' 1377 ] 1378 for x in decimal_digits: 1379 self.assertEqual(re.match(r'^\d$', x).group(0), x) 1380 1381 not_decimal_digits = [ 1382 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl' 1383 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' 1384 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No' 1385 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' 1386 ] 1387 for x in not_decimal_digits: 1388 self.assertIsNone(re.match(r'^\d$', x)) 1389 1390 def test_empty_array(self): 1391 # SF buf 1647541 1392 import array 1393 for typecode in 'bBuhHiIlLfd': 1394 a = array.array(typecode) 1395 self.assertIsNone(re.compile(b"bla").match(a)) 1396 self.assertEqual(re.compile(b"").match(a).groups(), ()) 1397 1398 def test_inline_flags(self): 1399 # Bug #1700 1400 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below 1401 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below 1402 1403 p = re.compile('.' + upper_char, re.I | re.S) 1404 q = p.match('\n' + lower_char) 1405 self.assertTrue(q) 1406 1407 p = re.compile('.' + lower_char, re.I | re.S) 1408 q = p.match('\n' + upper_char) 1409 self.assertTrue(q) 1410 1411 p = re.compile('(?i).' + upper_char, re.S) 1412 q = p.match('\n' + lower_char) 1413 self.assertTrue(q) 1414 1415 p = re.compile('(?i).' + lower_char, re.S) 1416 q = p.match('\n' + upper_char) 1417 self.assertTrue(q) 1418 1419 p = re.compile('(?is).' + upper_char) 1420 q = p.match('\n' + lower_char) 1421 self.assertTrue(q) 1422 1423 p = re.compile('(?is).' + lower_char) 1424 q = p.match('\n' + upper_char) 1425 self.assertTrue(q) 1426 1427 p = re.compile('(?s)(?i).' + upper_char) 1428 q = p.match('\n' + lower_char) 1429 self.assertTrue(q) 1430 1431 p = re.compile('(?s)(?i).' 
+ lower_char) 1432 q = p.match('\n' + upper_char) 1433 self.assertTrue(q) 1434 1435 self.assertTrue(re.match('(?ix) ' + upper_char, lower_char)) 1436 self.assertTrue(re.match('(?ix) ' + lower_char, upper_char)) 1437 self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X)) 1438 self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char)) 1439 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X)) 1440 1441 p = upper_char + '(?i)' 1442 with self.assertWarns(DeprecationWarning) as warns: 1443 self.assertTrue(re.match(p, lower_char)) 1444 self.assertEqual( 1445 str(warns.warnings[0].message), 1446 'Flags not at the start of the expression %r' % p 1447 ) 1448 self.assertEqual(warns.warnings[0].filename, __file__) 1449 1450 p = upper_char + '(?i)%s' % ('.?' * 100) 1451 with self.assertWarns(DeprecationWarning) as warns: 1452 self.assertTrue(re.match(p, lower_char)) 1453 self.assertEqual( 1454 str(warns.warnings[0].message), 1455 'Flags not at the start of the expression %r (truncated)' % p[:20] 1456 ) 1457 self.assertEqual(warns.warnings[0].filename, __file__) 1458 1459 # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning 1460 with warnings.catch_warnings(): 1461 warnings.simplefilter('error', BytesWarning) 1462 p = b'A(?i)' 1463 with self.assertWarns(DeprecationWarning) as warns: 1464 self.assertTrue(re.match(p, b'a')) 1465 self.assertEqual( 1466 str(warns.warnings[0].message), 1467 'Flags not at the start of the expression %r' % p 1468 ) 1469 self.assertEqual(warns.warnings[0].filename, __file__) 1470 1471 with self.assertWarns(DeprecationWarning): 1472 self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char)) 1473 with self.assertWarns(DeprecationWarning): 1474 self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char)) 1475 with self.assertWarns(DeprecationWarning): 1476 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char)) 1477 with self.assertWarns(DeprecationWarning): 1478 
self.assertTrue(re.match('^(?i)' + upper_char, lower_char)) 1479 with self.assertWarns(DeprecationWarning): 1480 self.assertTrue(re.match('$|(?i)' + upper_char, lower_char)) 1481 with self.assertWarns(DeprecationWarning) as warns: 1482 self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char)) 1483 self.assertRegex(str(warns.warnings[0].message), 1484 'Flags not at the start') 1485 self.assertEqual(warns.warnings[0].filename, __file__) 1486 with self.assertWarns(DeprecationWarning) as warns: 1487 self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')', 1488 lower_char)) 1489 self.assertRegex(str(warns.warnings[0].message), 1490 'Flags not at the start') 1491 self.assertEqual(warns.warnings[0].filename, __file__) 1492 with self.assertWarns(DeprecationWarning) as warns: 1493 self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')', 1494 lower_char)) 1495 self.assertRegex(str(warns.warnings[0].message), 1496 'Flags not at the start') 1497 self.assertEqual(warns.warnings[0].filename, __file__) 1498 1499 1500 def test_dollar_matches_twice(self): 1501 "$ matches the end of string, and just before the terminating \n" 1502 pattern = re.compile('$') 1503 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') 1504 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') 1505 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 1506 1507 pattern = re.compile('$', re.MULTILINE) 1508 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) 1509 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') 1510 self.assertEqual(pattern.sub('#', '\n'), '#\n#') 1511 1512 def test_bytes_str_mixing(self): 1513 # Mixing str and bytes is disallowed 1514 pat = re.compile('.') 1515 bpat = re.compile(b'.') 1516 self.assertRaises(TypeError, pat.match, b'b') 1517 self.assertRaises(TypeError, bpat.match, 'b') 1518 self.assertRaises(TypeError, pat.sub, b'b', 'c') 1519 self.assertRaises(TypeError, pat.sub, 'b', b'c') 1520 self.assertRaises(TypeError, pat.sub, b'b', 
b'c') 1521 self.assertRaises(TypeError, bpat.sub, b'b', 'c') 1522 self.assertRaises(TypeError, bpat.sub, 'b', b'c') 1523 self.assertRaises(TypeError, bpat.sub, 'b', 'c') 1524 1525 def test_ascii_and_unicode_flag(self): 1526 # String patterns 1527 for flags in (0, re.UNICODE): 1528 pat = re.compile('\xc0', flags | re.IGNORECASE) 1529 self.assertTrue(pat.match('\xe0')) 1530 pat = re.compile(r'\w', flags) 1531 self.assertTrue(pat.match('\xe0')) 1532 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE) 1533 self.assertIsNone(pat.match('\xe0')) 1534 pat = re.compile('(?a)\xc0', re.IGNORECASE) 1535 self.assertIsNone(pat.match('\xe0')) 1536 pat = re.compile(r'\w', re.ASCII) 1537 self.assertIsNone(pat.match('\xe0')) 1538 pat = re.compile(r'(?a)\w') 1539 self.assertIsNone(pat.match('\xe0')) 1540 # Bytes patterns 1541 for flags in (0, re.ASCII): 1542 pat = re.compile(b'\xc0', flags | re.IGNORECASE) 1543 self.assertIsNone(pat.match(b'\xe0')) 1544 pat = re.compile(br'\w', flags) 1545 self.assertIsNone(pat.match(b'\xe0')) 1546 # Incompatibilities 1547 self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE) 1548 self.assertRaises(re.error, re.compile, br'(?u)\w') 1549 self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII) 1550 self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII) 1551 self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE) 1552 self.assertRaises(re.error, re.compile, r'(?au)\w') 1553 1554 def test_locale_flag(self): 1555 enc = locale.getpreferredencoding() 1556 # Search non-ASCII letter 1557 for i in range(128, 256): 1558 try: 1559 c = bytes([i]).decode(enc) 1560 sletter = c.lower() 1561 if sletter == c: continue 1562 bletter = sletter.encode(enc) 1563 if len(bletter) != 1: continue 1564 if bletter.decode(enc) != sletter: continue 1565 bpat = re.escape(bytes([i])) 1566 break 1567 except (UnicodeError, TypeError): 1568 pass 1569 else: 1570 bletter = None 1571 bpat = b'A' 1572 # Bytes patterns 1573 pat = re.compile(bpat, 
re.LOCALE | re.IGNORECASE) 1574 if bletter: 1575 self.assertTrue(pat.match(bletter)) 1576 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE) 1577 if bletter: 1578 self.assertTrue(pat.match(bletter)) 1579 pat = re.compile(bpat, re.IGNORECASE) 1580 if bletter: 1581 self.assertIsNone(pat.match(bletter)) 1582 pat = re.compile(br'\w', re.LOCALE) 1583 if bletter: 1584 self.assertTrue(pat.match(bletter)) 1585 pat = re.compile(br'(?L)\w') 1586 if bletter: 1587 self.assertTrue(pat.match(bletter)) 1588 pat = re.compile(br'\w') 1589 if bletter: 1590 self.assertIsNone(pat.match(bletter)) 1591 # Incompatibilities 1592 self.assertRaises(ValueError, re.compile, '', re.LOCALE) 1593 self.assertRaises(re.error, re.compile, '(?L)') 1594 self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII) 1595 self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII) 1596 self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE) 1597 self.assertRaises(re.error, re.compile, b'(?aL)') 1598 1599 def test_scoped_flags(self): 1600 self.assertTrue(re.match(r'(?i:a)b', 'Ab')) 1601 self.assertIsNone(re.match(r'(?i:a)b', 'aB')) 1602 self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE)) 1603 self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE)) 1604 self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab')) 1605 self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB')) 1606 1607 self.assertTrue(re.match(r'(?x: a) b', 'a b')) 1608 self.assertIsNone(re.match(r'(?x: a) b', ' a b')) 1609 self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE)) 1610 self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) 1611 1612 self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0')) 1613 self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0')) 1614 self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII)) 1615 1616 self.checkPatternError(r'(?a)(?-a:\w)', 1617 "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8) 1618 self.checkPatternError(r'(?i-i:a)', 1619 'bad inline flags: 
flag turned on and off', 5) 1620 self.checkPatternError(r'(?au:a)', 1621 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) 1622 self.checkPatternError(br'(?aL:a)', 1623 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) 1624 1625 self.checkPatternError(r'(?-', 'missing flag', 3) 1626 self.checkPatternError(r'(?-+', 'missing flag', 3) 1627 self.checkPatternError(r'(?-z', 'unknown flag', 3) 1628 self.checkPatternError(r'(?-i', 'missing :', 4) 1629 self.checkPatternError(r'(?-i)', 'missing :', 4) 1630 self.checkPatternError(r'(?-i+', 'missing :', 4) 1631 self.checkPatternError(r'(?-iz', 'unknown flag', 4) 1632 self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0) 1633 self.checkPatternError(r'(?i', 'missing -, : or )', 3) 1634 self.checkPatternError(r'(?i+', 'missing -, : or )', 3) 1635 self.checkPatternError(r'(?iz', 'unknown flag', 3) 1636 1637 def test_bug_6509(self): 1638 # Replacement strings of both types must parse properly. 1639 # all strings 1640 pat = re.compile(r'a(\w)') 1641 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc') 1642 pat = re.compile('a(.)') 1643 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234') 1644 pat = re.compile('..') 1645 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str') 1646 1647 # all bytes 1648 pat = re.compile(br'a(\w)') 1649 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc') 1650 pat = re.compile(b'a(.)') 1651 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD') 1652 pat = re.compile(b'..') 1653 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes') 1654 1655 def test_dealloc(self): 1656 # issue 3299: check for segfault in debug build 1657 import _sre 1658 # the overflow limit is different on wide and narrow builds and it 1659 # depends on the definition of SRE_CODE (see sre.h). 1660 # 2**128 should be big enough to overflow on both. For smaller values 1661 # a RuntimeError is raised instead of OverflowError. 
1662 long_overflow = 2**128 1663 self.assertRaises(TypeError, re.finditer, "a", {}) 1664 with self.assertRaises(OverflowError): 1665 _sre.compile("abc", 0, [long_overflow], 0, {}, ()) 1666 with self.assertRaises(TypeError): 1667 _sre.compile({}, 0, [], 0, [], []) 1668 1669 def test_search_dot_unicode(self): 1670 self.assertTrue(re.search("123.*-", '123abc-')) 1671 self.assertTrue(re.search("123.*-", '123\xe9-')) 1672 self.assertTrue(re.search("123.*-", '123\u20ac-')) 1673 self.assertTrue(re.search("123.*-", '123\U0010ffff-')) 1674 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-')) 1675 1676 def test_compile(self): 1677 # Test return value when given string and pattern as parameter 1678 pattern = re.compile('random pattern') 1679 self.assertIsInstance(pattern, re.Pattern) 1680 same_pattern = re.compile(pattern) 1681 self.assertIsInstance(same_pattern, re.Pattern) 1682 self.assertIs(same_pattern, pattern) 1683 # Test behaviour when not given a string or pattern as parameter 1684 self.assertRaises(TypeError, re.compile, 0) 1685 1686 @bigmemtest(size=_2G, memuse=1) 1687 def test_large_search(self, size): 1688 # Issue #10182: indices were 32-bit-truncated. 1689 s = 'a' * size 1690 m = re.search('$', s) 1691 self.assertIsNotNone(m) 1692 self.assertEqual(m.start(), size) 1693 self.assertEqual(m.end(), size) 1694 1695 # The huge memuse is because of re.sub() using a list and a join() 1696 # to create the replacement result. 1697 @bigmemtest(size=_2G, memuse=16 + 2) 1698 def test_large_subn(self, size): 1699 # Issue #10182: indices were 32-bit-truncated. 1700 s = 'a' * size 1701 r, n = re.subn('', '', s) 1702 self.assertEqual(r, s) 1703 self.assertEqual(n, size + 1) 1704 1705 def test_bug_16688(self): 1706 # Issue 16688: Backreferences make case-insensitive regex fail on 1707 # non-ASCII strings. 
1708 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a']) 1709 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2)) 1710 1711 def test_repeat_minmax_overflow(self): 1712 # Issue #13169 1713 string = "x" * 100000 1714 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535)) 1715 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535)) 1716 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535)) 1717 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536)) 1718 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536)) 1719 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536)) 1720 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t. 1721 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128) 1722 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128) 1723 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128) 1724 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) 1725 1726 @cpython_only 1727 def test_repeat_minmax_overflow_maxrepeat(self): 1728 try: 1729 from _sre import MAXREPEAT 1730 except ImportError: 1731 self.skipTest('requires _sre.MAXREPEAT constant') 1732 string = "x" * 100000 1733 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string)) 1734 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(), 1735 (0, 100000)) 1736 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string)) 1737 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT) 1738 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT) 1739 self.assertRaises(OverflowError, re.compile, r".{%d,}?" 
% MAXREPEAT) 1740 1741 def test_backref_group_name_in_exception(self): 1742 # Issue 17341: Poor error message when compiling invalid regex 1743 self.checkPatternError('(?P=<foo>)', 1744 "bad character in group name '<foo>'", 4) 1745 1746 def test_group_name_in_exception(self): 1747 # Issue 17341: Poor error message when compiling invalid regex 1748 self.checkPatternError('(?P<?foo>)', 1749 "bad character in group name '?foo'", 4) 1750 1751 def test_issue17998(self): 1752 for reps in '*', '+', '?', '{1}': 1753 for mod in '', '?': 1754 pattern = '.' + reps + mod + 'yz' 1755 self.assertEqual(re.compile(pattern, re.S).findall('xyz'), 1756 ['xyz'], msg=pattern) 1757 pattern = pattern.encode() 1758 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'), 1759 [b'xyz'], msg=pattern) 1760 1761 def test_match_repr(self): 1762 for string in '[abracadabra]', S('[abracadabra]'): 1763 m = re.search(r'(.+)(.*?)\1', string) 1764 pattern = r"<(%s\.)?%s object; span=\(1, 12\), match='abracadabra'>" % ( 1765 type(m).__module__, type(m).__qualname__ 1766 ) 1767 self.assertRegex(repr(m), pattern) 1768 for string in (b'[abracadabra]', B(b'[abracadabra]'), 1769 bytearray(b'[abracadabra]'), 1770 memoryview(b'[abracadabra]')): 1771 m = re.search(br'(.+)(.*?)\1', string) 1772 pattern = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % ( 1773 type(m).__module__, type(m).__qualname__ 1774 ) 1775 self.assertRegex(repr(m), pattern) 1776 1777 first, second = list(re.finditer("(aa)|(bb)", "aa bb")) 1778 pattern = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % ( 1779 type(second).__module__, type(second).__qualname__ 1780 ) 1781 self.assertRegex(repr(first), pattern) 1782 pattern = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % ( 1783 type(second).__module__, type(second).__qualname__ 1784 ) 1785 self.assertRegex(repr(second), pattern) 1786 1787 def test_zerowidth(self): 1788 # Issues 852532, 1647489, 3262, 25054. 
1789 self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', '']) 1790 self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', '']) 1791 self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc']) 1792 self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', '']) 1793 1794 self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-') 1795 self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-') 1796 self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]') 1797 1798 self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', '']) 1799 self.assertEqual(re.findall(r"\b|\w+", "a::bc"), 1800 ['', 'a', '', '', 'bc', '']) 1801 1802 self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")], 1803 [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)]) 1804 self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")], 1805 [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)]) 1806 1807 def test_bug_2537(self): 1808 # issue 2537: empty submatches 1809 for outer_op in ('{0,}', '*', '+', '{1,187}'): 1810 for inner_op in ('{0,}', '*', '?'): 1811 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op)) 1812 m = r.match("xyyzy") 1813 self.assertEqual(m.group(0), "xyy") 1814 self.assertEqual(m.group(1), "") 1815 self.assertEqual(m.group(2), "y") 1816 1817 @cpython_only 1818 def test_debug_flag(self): 1819 pat = r'(\.)(?:[ch]|py)(?(1)$|: )' 1820 with captured_stdout() as out: 1821 re.compile(pat, re.DEBUG) 1822 self.maxDiff = None 1823 dump = '''\ 1824SUBPATTERN 1 0 0 1825 LITERAL 46 1826BRANCH 1827 IN 1828 LITERAL 99 1829 LITERAL 104 1830OR 1831 LITERAL 112 1832 LITERAL 121 1833GROUPREF_EXISTS 1 1834 AT AT_END 1835ELSE 1836 LITERAL 58 1837 LITERAL 32 1838 1839 0. INFO 8 0b1 2 5 (to 9) 1840 prefix_skip 0 1841 prefix [0x2e] ('.') 1842 overlap [0] 1843 9: MARK 0 184411. LITERAL 0x2e ('.') 184513. MARK 1 184615. BRANCH 10 (to 26) 184717. IN 6 (to 24) 184819. LITERAL 0x63 ('c') 184921. 
LITERAL 0x68 ('h') 185023. FAILURE 185124: JUMP 9 (to 34) 185226: branch 7 (to 33) 185327. LITERAL 0x70 ('p') 185429. LITERAL 0x79 ('y') 185531. JUMP 2 (to 34) 185633: FAILURE 185734: GROUPREF_EXISTS 0 6 (to 41) 185837. AT END 185939. JUMP 5 (to 45) 186041: LITERAL 0x3a (':') 186143. LITERAL 0x20 (' ') 186245: SUCCESS 1863''' 1864 self.assertEqual(out.getvalue(), dump) 1865 # Debug output is output again even a second time (bypassing 1866 # the cache -- issue #20426). 1867 with captured_stdout() as out: 1868 re.compile(pat, re.DEBUG) 1869 self.assertEqual(out.getvalue(), dump) 1870 1871 def test_keyword_parameters(self): 1872 # Issue #20283: Accepting the string keyword parameter. 1873 pat = re.compile(r'(ab)') 1874 self.assertEqual( 1875 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9)) 1876 self.assertEqual( 1877 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9)) 1878 self.assertEqual( 1879 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9)) 1880 self.assertEqual( 1881 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab']) 1882 self.assertEqual( 1883 pat.split(string='abracadabra', maxsplit=1), 1884 ['', 'ab', 'racadabra']) 1885 self.assertEqual( 1886 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(), 1887 (7, 9)) 1888 1889 def test_bug_20998(self): 1890 # Issue #20998: Fullmatch of repeated single character pattern 1891 # with ignore case. 
1892 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3)) 1893 1894 def test_locale_caching(self): 1895 # Issue #22410 1896 oldlocale = locale.setlocale(locale.LC_CTYPE) 1897 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1898 for loc in 'en_US.iso88591', 'en_US.utf8': 1899 try: 1900 locale.setlocale(locale.LC_CTYPE, loc) 1901 except locale.Error: 1902 # Unsupported locale on this system 1903 self.skipTest('test needs %s locale' % loc) 1904 1905 re.purge() 1906 self.check_en_US_iso88591() 1907 self.check_en_US_utf8() 1908 re.purge() 1909 self.check_en_US_utf8() 1910 self.check_en_US_iso88591() 1911 1912 def check_en_US_iso88591(self): 1913 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') 1914 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1915 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1916 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1917 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1918 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5')) 1919 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5')) 1920 1921 def check_en_US_utf8(self): 1922 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') 1923 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I)) 1924 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I)) 1925 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I)) 1926 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5')) 1927 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) 1928 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) 1929 1930 def test_locale_compiled(self): 1931 oldlocale = locale.setlocale(locale.LC_CTYPE) 1932 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) 1933 for loc in 'en_US.iso88591', 'en_US.utf8': 1934 try: 1935 locale.setlocale(locale.LC_CTYPE, loc) 1936 except locale.Error: 1937 # Unsupported locale on this system 1938 self.skipTest('test needs %s locale' % loc) 1939 1940 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591') 1941 p1 = 
re.compile(b'\xc5\xe5', re.L|re.I) 1942 p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I) 1943 p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I) 1944 p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I) 1945 for p in p1, p2, p3: 1946 self.assertTrue(p.match(b'\xc5\xe5')) 1947 self.assertTrue(p.match(b'\xe5\xe5')) 1948 self.assertTrue(p.match(b'\xc5\xc5')) 1949 self.assertIsNone(p4.match(b'\xe5\xc5')) 1950 self.assertIsNone(p4.match(b'\xe5\xe5')) 1951 self.assertIsNone(p4.match(b'\xc5\xc5')) 1952 1953 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8') 1954 for p in p1, p2, p3: 1955 self.assertTrue(p.match(b'\xc5\xe5')) 1956 self.assertIsNone(p.match(b'\xe5\xe5')) 1957 self.assertIsNone(p.match(b'\xc5\xc5')) 1958 self.assertTrue(p4.match(b'\xe5\xc5')) 1959 self.assertIsNone(p4.match(b'\xe5\xe5')) 1960 self.assertIsNone(p4.match(b'\xc5\xc5')) 1961 1962 def test_error(self): 1963 with self.assertRaises(re.error) as cm: 1964 re.compile('(\u20ac))') 1965 err = cm.exception 1966 self.assertIsInstance(err.pattern, str) 1967 self.assertEqual(err.pattern, '(\u20ac))') 1968 self.assertEqual(err.pos, 3) 1969 self.assertEqual(err.lineno, 1) 1970 self.assertEqual(err.colno, 4) 1971 self.assertIn(err.msg, str(err)) 1972 self.assertIn(' at position 3', str(err)) 1973 self.assertNotIn(' at position 3', err.msg) 1974 # Bytes pattern 1975 with self.assertRaises(re.error) as cm: 1976 re.compile(b'(\xa4))') 1977 err = cm.exception 1978 self.assertIsInstance(err.pattern, bytes) 1979 self.assertEqual(err.pattern, b'(\xa4))') 1980 self.assertEqual(err.pos, 3) 1981 # Multiline pattern 1982 with self.assertRaises(re.error) as cm: 1983 re.compile(""" 1984 ( 1985 abc 1986 ) 1987 ) 1988 ( 1989 """, re.VERBOSE) 1990 err = cm.exception 1991 self.assertEqual(err.pos, 77) 1992 self.assertEqual(err.lineno, 5) 1993 self.assertEqual(err.colno, 17) 1994 self.assertIn(err.msg, str(err)) 1995 self.assertIn(' at position 77', str(err)) 1996 self.assertIn('(line 5, column 17)', str(err)) 1997 1998 def 
test_misc_errors(self): 1999 self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0) 2000 self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0) 2001 self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5) 2002 self.checkPatternError(r'(?P', 'unexpected end of pattern', 3) 2003 self.checkPatternError(r'(?z)', 'unknown extension ?z', 1) 2004 self.checkPatternError(r'(?iz)', 'unknown flag', 3) 2005 self.checkPatternError(r'(?i', 'missing -, : or )', 3) 2006 self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0) 2007 self.checkPatternError(r'(?<', 'unexpected end of pattern', 3) 2008 self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1) 2009 self.checkPatternError(r'(?', 'unexpected end of pattern', 2) 2010 2011 def test_enum(self): 2012 # Issue #28082: Check that str(flag) returns a human readable string 2013 # instead of an integer 2014 self.assertIn('ASCII', str(re.A)) 2015 self.assertIn('DOTALL', str(re.S)) 2016 2017 def test_pattern_compare(self): 2018 pattern1 = re.compile('abc', re.IGNORECASE) 2019 2020 # equal to itself 2021 self.assertEqual(pattern1, pattern1) 2022 self.assertFalse(pattern1 != pattern1) 2023 2024 # equal 2025 re.purge() 2026 pattern2 = re.compile('abc', re.IGNORECASE) 2027 self.assertEqual(hash(pattern2), hash(pattern1)) 2028 self.assertEqual(pattern2, pattern1) 2029 2030 # not equal: different pattern 2031 re.purge() 2032 pattern3 = re.compile('XYZ', re.IGNORECASE) 2033 # Don't test hash(pattern3) != hash(pattern1) because there is no 2034 # warranty that hash values are different 2035 self.assertNotEqual(pattern3, pattern1) 2036 2037 # not equal: different flag (flags=0) 2038 re.purge() 2039 pattern4 = re.compile('abc') 2040 self.assertNotEqual(pattern4, pattern1) 2041 2042 # only == and != comparison operators are supported 2043 with self.assertRaises(TypeError): 2044 pattern1 < pattern2 2045 2046 def test_pattern_compare_bytes(self): 2047 pattern1 = re.compile(b'abc') 2048 
2049 # equal: test bytes patterns 2050 re.purge() 2051 pattern2 = re.compile(b'abc') 2052 self.assertEqual(hash(pattern2), hash(pattern1)) 2053 self.assertEqual(pattern2, pattern1) 2054 2055 # not equal: pattern of a different types (str vs bytes), 2056 # comparison must not raise a BytesWarning 2057 re.purge() 2058 pattern3 = re.compile('abc') 2059 with warnings.catch_warnings(): 2060 warnings.simplefilter('error', BytesWarning) 2061 self.assertNotEqual(pattern3, pattern1) 2062 2063 def test_bug_29444(self): 2064 s = bytearray(b'abcdefgh') 2065 m = re.search(b'[a-h]+', s) 2066 m2 = re.search(b'[e-h]+', s) 2067 self.assertEqual(m.group(), b'abcdefgh') 2068 self.assertEqual(m2.group(), b'efgh') 2069 s[:] = b'xyz' 2070 self.assertEqual(m.group(), b'xyz') 2071 self.assertEqual(m2.group(), b'') 2072 2073 def test_bug_34294(self): 2074 # Issue 34294: wrong capturing groups 2075 2076 # exists since Python 2 2077 s = "a\tx" 2078 p = r"\b(?=(\t)|(x))x" 2079 self.assertEqual(re.search(p, s).groups(), (None, 'x')) 2080 2081 # introduced in Python 3.7.0 2082 s = "ab" 2083 p = r"(?=(.)(.)?)" 2084 self.assertEqual(re.findall(p, s), 2085 [('a', 'b'), ('b', '')]) 2086 self.assertEqual([m.groups() for m in re.finditer(p, s)], 2087 [('a', 'b'), ('b', None)]) 2088 2089 # test-cases provided by issue34294, introduced in Python 3.7.0 2090 p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)" 2091 s = "<test><foo2/></test>" 2092 self.assertEqual(re.findall(p, s), 2093 [('test', '<foo2/>'), ('foo2', '')]) 2094 self.assertEqual([m.groupdict() for m in re.finditer(p, s)], 2095 [{'tag': 'test', 'text': '<foo2/>'}, 2096 {'tag': 'foo2', 'text': None}]) 2097 s = "<test>Hello</test><foo/>" 2098 self.assertEqual([m.groupdict() for m in re.finditer(p, s)], 2099 [{'tag': 'test', 'text': 'Hello'}, 2100 {'tag': 'foo', 'text': None}]) 2101 s = "<test>Hello</test><foo/><foo/>" 2102 self.assertEqual([m.groupdict() for m in re.finditer(p, s)], 2103 [{'tag': 'test', 'text': 'Hello'}, 2104 {'tag': 
'foo', 'text': None}, 2105 {'tag': 'foo', 'text': None}]) 2106 2107 2108class PatternReprTests(unittest.TestCase): 2109 def check(self, pattern, expected): 2110 self.assertEqual(repr(re.compile(pattern)), expected) 2111 2112 def check_flags(self, pattern, flags, expected): 2113 self.assertEqual(repr(re.compile(pattern, flags)), expected) 2114 2115 def test_without_flags(self): 2116 self.check('random pattern', 2117 "re.compile('random pattern')") 2118 2119 def test_single_flag(self): 2120 self.check_flags('random pattern', re.IGNORECASE, 2121 "re.compile('random pattern', re.IGNORECASE)") 2122 2123 def test_multiple_flags(self): 2124 self.check_flags('random pattern', re.I|re.S|re.X, 2125 "re.compile('random pattern', " 2126 "re.IGNORECASE|re.DOTALL|re.VERBOSE)") 2127 2128 def test_unicode_flag(self): 2129 self.check_flags('random pattern', re.U, 2130 "re.compile('random pattern')") 2131 self.check_flags('random pattern', re.I|re.S|re.U, 2132 "re.compile('random pattern', " 2133 "re.IGNORECASE|re.DOTALL)") 2134 2135 def test_inline_flags(self): 2136 self.check('(?i)pattern', 2137 "re.compile('(?i)pattern', re.IGNORECASE)") 2138 2139 def test_unknown_flags(self): 2140 self.check_flags('random pattern', 0x123000, 2141 "re.compile('random pattern', 0x123000)") 2142 self.check_flags('random pattern', 0x123000|re.I, 2143 "re.compile('random pattern', re.IGNORECASE|0x123000)") 2144 2145 def test_bytes(self): 2146 self.check(b'bytes pattern', 2147 "re.compile(b'bytes pattern')") 2148 self.check_flags(b'bytes pattern', re.A, 2149 "re.compile(b'bytes pattern', re.ASCII)") 2150 2151 def test_locale(self): 2152 self.check_flags(b'bytes pattern', re.L, 2153 "re.compile(b'bytes pattern', re.LOCALE)") 2154 2155 def test_quotes(self): 2156 self.check('random "double quoted" pattern', 2157 '''re.compile('random "double quoted" pattern')''') 2158 self.check("random 'single quoted' pattern", 2159 '''re.compile("random 'single quoted' pattern")''') 2160 self.check('''both 'single' 
and "double" quotes''', 2161 '''re.compile('both \\'single\\' and "double" quotes')''') 2162 2163 def test_long_pattern(self): 2164 pattern = 'Very %spattern' % ('long ' * 1000) 2165 r = repr(re.compile(pattern)) 2166 self.assertLess(len(r), 300) 2167 self.assertEqual(r[:30], "re.compile('Very long long lon") 2168 r = repr(re.compile(pattern, re.I)) 2169 self.assertLess(len(r), 300) 2170 self.assertEqual(r[:30], "re.compile('Very long long lon") 2171 self.assertEqual(r[-16:], ", re.IGNORECASE)") 2172 2173 def test_flags_repr(self): 2174 self.assertEqual(repr(re.I), "re.IGNORECASE") 2175 self.assertEqual(repr(re.I|re.S|re.X), 2176 "re.IGNORECASE|re.DOTALL|re.VERBOSE") 2177 self.assertEqual(repr(re.I|re.S|re.X|(1<<20)), 2178 "re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000") 2179 self.assertEqual(repr(~re.I), "~re.IGNORECASE") 2180 self.assertEqual(repr(~(re.I|re.S|re.X)), 2181 "~(re.IGNORECASE|re.DOTALL|re.VERBOSE)") 2182 self.assertEqual(repr(~(re.I|re.S|re.X|(1<<20))), 2183 "~(re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000)") 2184 2185 2186class ImplementationTest(unittest.TestCase): 2187 """ 2188 Test implementation details of the re module. 
2189 """ 2190 2191 def test_overlap_table(self): 2192 f = sre_compile._generate_overlap_table 2193 self.assertEqual(f(""), []) 2194 self.assertEqual(f("a"), [0]) 2195 self.assertEqual(f("abcd"), [0, 0, 0, 0]) 2196 self.assertEqual(f("aaaa"), [0, 1, 2, 3]) 2197 self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1]) 2198 self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) 2199 2200 2201class ExternalTests(unittest.TestCase): 2202 2203 def test_re_benchmarks(self): 2204 're_tests benchmarks' 2205 from test.re_tests import benchmarks 2206 for pattern, s in benchmarks: 2207 with self.subTest(pattern=pattern, string=s): 2208 p = re.compile(pattern) 2209 self.assertTrue(p.search(s)) 2210 self.assertTrue(p.match(s)) 2211 self.assertTrue(p.fullmatch(s)) 2212 s2 = ' '*10000 + s + ' '*10000 2213 self.assertTrue(p.search(s2)) 2214 self.assertTrue(p.match(s2, 10000)) 2215 self.assertTrue(p.match(s2, 10000, 10000 + len(s))) 2216 self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s))) 2217 2218 def test_re_tests(self): 2219 're_tests test suite' 2220 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR 2221 for t in tests: 2222 pattern = s = outcome = repl = expected = None 2223 if len(t) == 5: 2224 pattern, s, outcome, repl, expected = t 2225 elif len(t) == 3: 2226 pattern, s, outcome = t 2227 else: 2228 raise ValueError('Test tuples should have 3 or 5 fields', t) 2229 2230 with self.subTest(pattern=pattern, string=s): 2231 if outcome == SYNTAX_ERROR: # Expected a syntax error 2232 with self.assertRaises(re.error): 2233 re.compile(pattern) 2234 continue 2235 2236 obj = re.compile(pattern) 2237 result = obj.search(s) 2238 if outcome == FAIL: 2239 self.assertIsNone(result, 'Succeeded incorrectly') 2240 continue 2241 2242 with self.subTest(): 2243 self.assertTrue(result, 'Failed incorrectly') 2244 # Matched, as expected, so now we compute the 2245 # result string and compare it to our expected result. 
2246 start, end = result.span(0) 2247 vardict = {'found': result.group(0), 2248 'groups': result.group(), 2249 'flags': result.re.flags} 2250 for i in range(1, 100): 2251 try: 2252 gi = result.group(i) 2253 # Special hack because else the string concat fails: 2254 if gi is None: 2255 gi = "None" 2256 except IndexError: 2257 gi = "Error" 2258 vardict['g%d' % i] = gi 2259 for i in result.re.groupindex.keys(): 2260 try: 2261 gi = result.group(i) 2262 if gi is None: 2263 gi = "None" 2264 except IndexError: 2265 gi = "Error" 2266 vardict[i] = gi 2267 self.assertEqual(eval(repl, vardict), expected, 2268 'grouping error') 2269 2270 # Try the match with both pattern and string converted to 2271 # bytes, and check that it still succeeds. 2272 try: 2273 bpat = bytes(pattern, "ascii") 2274 bs = bytes(s, "ascii") 2275 except UnicodeEncodeError: 2276 # skip non-ascii tests 2277 pass 2278 else: 2279 with self.subTest('bytes pattern match'): 2280 obj = re.compile(bpat) 2281 self.assertTrue(obj.search(bs)) 2282 2283 # Try the match with LOCALE enabled, and check that it 2284 # still succeeds. 2285 with self.subTest('locale-sensitive match'): 2286 obj = re.compile(bpat, re.LOCALE) 2287 result = obj.search(bs) 2288 if result is None: 2289 print('=== Fails on locale-sensitive match', t) 2290 2291 # Try the match with the search area limited to the extent 2292 # of the match and see if it still succeeds. \B will 2293 # break (because it won't match at the end or start of a 2294 # string), so we'll ignore patterns that feature it. 2295 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B' 2296 and result is not None): 2297 with self.subTest('range-limited match'): 2298 obj = re.compile(pattern) 2299 self.assertTrue(obj.search(s, start, end + 1)) 2300 2301 # Try the match with IGNORECASE enabled, and check that it 2302 # still succeeds. 
2303 with self.subTest('case-insensitive match'): 2304 obj = re.compile(pattern, re.IGNORECASE) 2305 self.assertTrue(obj.search(s)) 2306 2307 # Try the match with UNICODE locale enabled, and check 2308 # that it still succeeds. 2309 with self.subTest('unicode-sensitive match'): 2310 obj = re.compile(pattern, re.UNICODE) 2311 self.assertTrue(obj.search(s)) 2312 2313 2314if __name__ == "__main__": 2315 unittest.main() 2316