1import difflib 2from test.support import run_unittest, findfile 3import unittest 4import doctest 5import sys 6 7 8class TestWithAscii(unittest.TestCase): 9 def test_one_insert(self): 10 sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100) 11 self.assertAlmostEqual(sm.ratio(), 0.995, places=3) 12 self.assertEqual(list(sm.get_opcodes()), 13 [ ('insert', 0, 0, 0, 1), 14 ('equal', 0, 100, 1, 101)]) 15 self.assertEqual(sm.bpopular, set()) 16 sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50) 17 self.assertAlmostEqual(sm.ratio(), 0.995, places=3) 18 self.assertEqual(list(sm.get_opcodes()), 19 [ ('equal', 0, 50, 0, 50), 20 ('insert', 50, 50, 50, 51), 21 ('equal', 50, 100, 51, 101)]) 22 self.assertEqual(sm.bpopular, set()) 23 24 def test_one_delete(self): 25 sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40) 26 self.assertAlmostEqual(sm.ratio(), 0.994, places=3) 27 self.assertEqual(list(sm.get_opcodes()), 28 [ ('equal', 0, 40, 0, 40), 29 ('delete', 40, 41, 40, 40), 30 ('equal', 41, 81, 40, 80)]) 31 32 def test_bjunk(self): 33 sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ', 34 a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40) 35 self.assertEqual(sm.bjunk, set()) 36 37 sm = difflib.SequenceMatcher(isjunk=lambda x: x == ' ', 38 a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20) 39 self.assertEqual(sm.bjunk, {' '}) 40 41 sm = difflib.SequenceMatcher(isjunk=lambda x: x in [' ', 'b'], 42 a='a' * 40 + 'b' * 40, b='a' * 44 + 'b' * 40 + ' ' * 20) 43 self.assertEqual(sm.bjunk, {' ', 'b'}) 44 45 46class TestAutojunk(unittest.TestCase): 47 """Tests for the autojunk parameter added in 2.7""" 48 def test_one_insert_homogenous_sequence(self): 49 # By default autojunk=True and the heuristic kicks in for a sequence 50 # of length 200+ 51 seq1 = 'b' * 200 52 seq2 = 'a' + 'b' * 200 53 54 sm = difflib.SequenceMatcher(None, seq1, seq2) 55 self.assertAlmostEqual(sm.ratio(), 0, places=3) 56 self.assertEqual(sm.bpopular, {'b'}) 57 58 # Now turn the heuristic off 59 sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False) 60 self.assertAlmostEqual(sm.ratio(), 0.9975, places=3) 61 self.assertEqual(sm.bpopular, set()) 62 63 64class TestSFbugs(unittest.TestCase): 65 def test_ratio_for_null_seqn(self): 66 # Check clearing of SF bug 763023 67 s = difflib.SequenceMatcher(None, [], []) 68 self.assertEqual(s.ratio(), 1) 69 self.assertEqual(s.quick_ratio(), 1) 70 self.assertEqual(s.real_quick_ratio(), 1) 71 72 def test_comparing_empty_lists(self): 73 # Check fix for bug #979794 74 group_gen = difflib.SequenceMatcher(None, [], []).get_grouped_opcodes() 75 self.assertRaises(StopIteration, next, group_gen) 76 diff_gen = difflib.unified_diff([], []) 77 self.assertRaises(StopIteration, next, diff_gen) 78 79 def test_matching_blocks_cache(self): 80 # Issue #21635 81 s = difflib.SequenceMatcher(None, "abxcd", "abcd") 82 first = s.get_matching_blocks() 83 second = s.get_matching_blocks() 84 self.assertEqual(second[0].size, 2) 85 self.assertEqual(second[1].size, 2) 86 self.assertEqual(second[2].size, 0) 87 88 def test_added_tab_hint(self): 89 # Check fix for bug #1488943 90 diff = list(difflib.Differ().compare(["\tI am a buggy"],["\t\tI am a bug"])) 91 self.assertEqual("- \tI am a buggy", diff[0]) 92 self.assertEqual("? \t --\n", diff[1]) 93 self.assertEqual("+ \t\tI am a bug", diff[2]) 94 self.assertEqual("? +\n", diff[3]) 95 96 def test_hint_indented_properly_with_tabs(self): 97 diff = list(difflib.Differ().compare(["\t \t \t^"], ["\t \t \t^\n"])) 98 self.assertEqual("- \t \t \t^", diff[0]) 99 self.assertEqual("+ \t \t \t^\n", diff[1]) 100 self.assertEqual("? \t \t \t +\n", diff[2]) 101 102 def test_mdiff_catch_stop_iteration(self): 103 # Issue #33224 104 self.assertEqual( 105 list(difflib._mdiff(["2"], ["3"], 1)), 106 [((1, '\x00-2\x01'), (1, '\x00+3\x01'), True)], 107 ) 108 109 110patch914575_from1 = """ 111 1. Beautiful is beTTer than ugly. 112 2. Explicit is better than implicit. 113 3. Simple is better than complex. 114 4. Complex is better than complicated. 115""" 116 117patch914575_to1 = """ 118 1. Beautiful is better than ugly. 119 3. Simple is better than complex. 120 4. Complicated is better than complex. 121 5. Flat is better than nested. 122""" 123 124patch914575_nonascii_from1 = """ 125 1. Beautiful is beTTer than ugly. 126 2. Explicit is better than ımplıcıt. 127 3. Simple is better than complex. 128 4. Complex is better than complicated. 129""" 130 131patch914575_nonascii_to1 = """ 132 1. Beautiful is better than ügly. 133 3. Sımple is better than complex. 134 4. Complicated is better than cömplex. 135 5. Flat is better than nested. 136""" 137 138patch914575_from2 = """ 139\t\tLine 1: preceded by from:[tt] to:[ssss] 140 \t\tLine 2: preceded by from:[sstt] to:[sssst] 141 \t \tLine 3: preceded by from:[sstst] to:[ssssss] 142Line 4: \thas from:[sst] to:[sss] after : 143Line 5: has from:[t] to:[ss] at end\t 144""" 145 146patch914575_to2 = """ 147 Line 1: preceded by from:[tt] to:[ssss] 148 \tLine 2: preceded by from:[sstt] to:[sssst] 149 Line 3: preceded by from:[sstst] to:[ssssss] 150Line 4: has from:[sst] to:[sss] after : 151Line 5: has from:[t] to:[ss] at end 152""" 153 154patch914575_from3 = """line 0 1551234567890123456789012345689012345 156line 1 157line 2 158line 3 159line 4 changed 160line 5 changed 161line 6 changed 162line 7 163line 8 subtracted 164line 9 1651234567890123456789012345689012345 166short line 167just fits in!! 168just fits in two lines yup!! 169the end""" 170 171patch914575_to3 = """line 0 1721234567890123456789012345689012345 173line 1 174line 2 added 175line 3 176line 4 chanGEd 177line 5a chanGed 178line 6a changEd 179line 7 180line 8 181line 9 1821234567890 183another long line that needs to be wrapped 184just fitS in!! 185just fits in two lineS yup!! 186the end""" 187 188class TestSFpatches(unittest.TestCase): 189 190 def test_html_diff(self): 191 # Check SF patch 914575 for generating HTML differences 192 f1a = ((patch914575_from1 + '123\n'*10)*3) 193 t1a = (patch914575_to1 + '123\n'*10)*3 194 f1b = '456\n'*10 + f1a 195 t1b = '456\n'*10 + t1a 196 f1a = f1a.splitlines() 197 t1a = t1a.splitlines() 198 f1b = f1b.splitlines() 199 t1b = t1b.splitlines() 200 f2 = patch914575_from2.splitlines() 201 t2 = patch914575_to2.splitlines() 202 f3 = patch914575_from3 203 t3 = patch914575_to3 204 i = difflib.HtmlDiff() 205 j = difflib.HtmlDiff(tabsize=2) 206 k = difflib.HtmlDiff(wrapcolumn=14) 207 208 full = i.make_file(f1a,t1a,'from','to',context=False,numlines=5) 209 tables = '\n'.join( 210 [ 211 '<h2>Context (first diff within numlines=5(default))</h2>', 212 i.make_table(f1a,t1a,'from','to',context=True), 213 '<h2>Context (first diff after numlines=5(default))</h2>', 214 i.make_table(f1b,t1b,'from','to',context=True), 215 '<h2>Context (numlines=6)</h2>', 216 i.make_table(f1a,t1a,'from','to',context=True,numlines=6), 217 '<h2>Context (numlines=0)</h2>', 218 i.make_table(f1a,t1a,'from','to',context=True,numlines=0), 219 '<h2>Same Context</h2>', 220 i.make_table(f1a,f1a,'from','to',context=True), 221 '<h2>Same Full</h2>', 222 i.make_table(f1a,f1a,'from','to',context=False), 223 '<h2>Empty Context</h2>', 224 i.make_table([],[],'from','to',context=True), 225 '<h2>Empty Full</h2>', 226 i.make_table([],[],'from','to',context=False), 227 '<h2>tabsize=2</h2>', 228 j.make_table(f2,t2), 229 '<h2>tabsize=default</h2>', 230 i.make_table(f2,t2), 231 '<h2>Context (wrapcolumn=14,numlines=0)</h2>', 232 k.make_table(f3.splitlines(),t3.splitlines(),context=True,numlines=0), 233 '<h2>wrapcolumn=14,splitlines()</h2>', 234 k.make_table(f3.splitlines(),t3.splitlines()), 235 '<h2>wrapcolumn=14,splitlines(True)</h2>', 236 k.make_table(f3.splitlines(True),t3.splitlines(True)), 237 ]) 238 actual = full.replace('</body>','\n%s\n</body>' % tables) 239 240 # temporarily uncomment next two lines to baseline this test 241 #with open('test_difflib_expect.html','w') as fp: 242 # fp.write(actual) 243 244 with open(findfile('test_difflib_expect.html')) as fp: 245 self.assertEqual(actual, fp.read()) 246 247 def test_recursion_limit(self): 248 # Check if the problem described in patch #1413711 exists. 249 limit = sys.getrecursionlimit() 250 old = [(i%2 and "K:%d" or "V:A:%d") % i for i in range(limit*2)] 251 new = [(i%2 and "K:%d" or "V:B:%d") % i for i in range(limit*2)] 252 difflib.SequenceMatcher(None, old, new).get_opcodes() 253 254 def test_make_file_default_charset(self): 255 html_diff = difflib.HtmlDiff() 256 output = html_diff.make_file(patch914575_from1.splitlines(), 257 patch914575_to1.splitlines()) 258 self.assertIn('content="text/html; charset=utf-8"', output) 259 260 def test_make_file_iso88591_charset(self): 261 html_diff = difflib.HtmlDiff() 262 output = html_diff.make_file(patch914575_from1.splitlines(), 263 patch914575_to1.splitlines(), 264 charset='iso-8859-1') 265 self.assertIn('content="text/html; charset=iso-8859-1"', output) 266 267 def test_make_file_usascii_charset_with_nonascii_input(self): 268 html_diff = difflib.HtmlDiff() 269 output = html_diff.make_file(patch914575_nonascii_from1.splitlines(), 270 patch914575_nonascii_to1.splitlines(), 271 charset='us-ascii') 272 self.assertIn('content="text/html; charset=us-ascii"', output) 273 self.assertIn('ımplıcıt', output) 274 275 276class TestOutputFormat(unittest.TestCase): 277 def test_tab_delimiter(self): 278 args = ['one', 'two', 'Original', 'Current', 279 '2005-01-26 23:30:50', '2010-04-02 10:20:52'] 280 ud = difflib.unified_diff(*args, lineterm='') 281 self.assertEqual(list(ud)[0:2], [ 282 "--- Original\t2005-01-26 23:30:50", 283 "+++ Current\t2010-04-02 10:20:52"]) 284 cd = difflib.context_diff(*args, lineterm='') 285 self.assertEqual(list(cd)[0:2], [ 286 "*** Original\t2005-01-26 23:30:50", 287 "--- Current\t2010-04-02 10:20:52"]) 288 289 def test_no_trailing_tab_on_empty_filedate(self): 290 args = ['one', 'two', 'Original', 'Current'] 291 ud = difflib.unified_diff(*args, lineterm='') 292 self.assertEqual(list(ud)[0:2], ["--- Original", "+++ Current"]) 293 294 cd = difflib.context_diff(*args, lineterm='') 295 self.assertEqual(list(cd)[0:2], ["*** Original", "--- Current"]) 296 297 def test_range_format_unified(self): 298 # Per the diff spec at http://www.unix.org/single_unix_specification/ 299 spec = '''\ 300 Each <range> field shall be of the form: 301 %1d", <beginning line number> if the range contains exactly one line, 302 and: 303 "%1d,%1d", <beginning line number>, <number of lines> otherwise. 304 If a range is empty, its beginning line number shall be the number of 305 the line just before the range, or 0 if the empty range starts the file. 306 ''' 307 fmt = difflib._format_range_unified 308 self.assertEqual(fmt(3,3), '3,0') 309 self.assertEqual(fmt(3,4), '4') 310 self.assertEqual(fmt(3,5), '4,2') 311 self.assertEqual(fmt(3,6), '4,3') 312 self.assertEqual(fmt(0,0), '0,0') 313 314 def test_range_format_context(self): 315 # Per the diff spec at http://www.unix.org/single_unix_specification/ 316 spec = '''\ 317 The range of lines in file1 shall be written in the following format 318 if the range contains two or more lines: 319 "*** %d,%d ****\n", <beginning line number>, <ending line number> 320 and the following format otherwise: 321 "*** %d ****\n", <ending line number> 322 The ending line number of an empty range shall be the number of the preceding line, 323 or 0 if the range is at the start of the file. 324 325 Next, the range of lines in file2 shall be written in the following format 326 if the range contains two or more lines: 327 "--- %d,%d ----\n", <beginning line number>, <ending line number> 328 and the following format otherwise: 329 "--- %d ----\n", <ending line number> 330 ''' 331 fmt = difflib._format_range_context 332 self.assertEqual(fmt(3,3), '3') 333 self.assertEqual(fmt(3,4), '4') 334 self.assertEqual(fmt(3,5), '4,5') 335 self.assertEqual(fmt(3,6), '4,6') 336 self.assertEqual(fmt(0,0), '0') 337 338 339class TestBytes(unittest.TestCase): 340 # don't really care about the content of the output, just the fact 341 # that it's bytes and we don't crash 342 def check(self, diff): 343 diff = list(diff) # trigger exceptions first 344 for line in diff: 345 self.assertIsInstance( 346 line, bytes, 347 "all lines of diff should be bytes, but got: %r" % line) 348 349 def test_byte_content(self): 350 # if we receive byte strings, we return byte strings 351 a = [b'hello', b'andr\xe9'] # iso-8859-1 bytes 352 b = [b'hello', b'andr\xc3\xa9'] # utf-8 bytes 353 354 unified = difflib.unified_diff 355 context = difflib.context_diff 356 357 check = self.check 358 check(difflib.diff_bytes(unified, a, a)) 359 check(difflib.diff_bytes(unified, a, b)) 360 361 # now with filenames (content and filenames are all bytes!) 362 check(difflib.diff_bytes(unified, a, a, b'a', b'a')) 363 check(difflib.diff_bytes(unified, a, b, b'a', b'b')) 364 365 # and with filenames and dates 366 check(difflib.diff_bytes(unified, a, a, b'a', b'a', b'2005', b'2013')) 367 check(difflib.diff_bytes(unified, a, b, b'a', b'b', b'2005', b'2013')) 368 369 # same all over again, with context diff 370 check(difflib.diff_bytes(context, a, a)) 371 check(difflib.diff_bytes(context, a, b)) 372 check(difflib.diff_bytes(context, a, a, b'a', b'a')) 373 check(difflib.diff_bytes(context, a, b, b'a', b'b')) 374 check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013')) 375 check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013')) 376 377 def test_byte_filenames(self): 378 # somebody renamed a file from ISO-8859-2 to UTF-8 379 fna = b'\xb3odz.txt' # "łodz.txt" 380 fnb = b'\xc5\x82odz.txt' 381 382 # they transcoded the content at the same time 383 a = [b'\xa3odz is a city in Poland.'] 384 b = [b'\xc5\x81odz is a city in Poland.'] 385 386 check = self.check 387 unified = difflib.unified_diff 388 context = difflib.context_diff 389 check(difflib.diff_bytes(unified, a, b, fna, fnb)) 390 check(difflib.diff_bytes(context, a, b, fna, fnb)) 391 392 def assertDiff(expect, actual): 393 # do not compare expect and equal as lists, because unittest 394 # uses difflib to report difference between lists 395 actual = list(actual) 396 self.assertEqual(len(expect), len(actual)) 397 for e, a in zip(expect, actual): 398 self.assertEqual(e, a) 399 400 expect = [ 401 b'--- \xb3odz.txt', 402 b'+++ \xc5\x82odz.txt', 403 b'@@ -1 +1 @@', 404 b'-\xa3odz is a city in Poland.', 405 b'+\xc5\x81odz is a city in Poland.', 406 ] 407 actual = difflib.diff_bytes(unified, a, b, fna, fnb, lineterm=b'') 408 assertDiff(expect, actual) 409 410 # with dates (plain ASCII) 411 datea = b'2005-03-18' 412 dateb = b'2005-03-19' 413 check(difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb)) 414 check(difflib.diff_bytes(context, a, b, fna, fnb, datea, dateb)) 415 416 expect = [ 417 # note the mixed encodings here: this is deeply wrong by every 418 # tenet of Unicode, but it doesn't crash, it's parseable by 419 # patch, and it's how UNIX(tm) diff behaves 420 b'--- \xb3odz.txt\t2005-03-18', 421 b'+++ \xc5\x82odz.txt\t2005-03-19', 422 b'@@ -1 +1 @@', 423 b'-\xa3odz is a city in Poland.', 424 b'+\xc5\x81odz is a city in Poland.', 425 ] 426 actual = difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb, 427 lineterm=b'') 428 assertDiff(expect, actual) 429 430 def test_mixed_types_content(self): 431 # type of input content must be consistent: all str or all bytes 432 a = [b'hello'] 433 b = ['hello'] 434 435 unified = difflib.unified_diff 436 context = difflib.context_diff 437 438 expect = "lines to compare must be str, not bytes (b'hello')" 439 self._assert_type_error(expect, unified, a, b) 440 self._assert_type_error(expect, unified, b, a) 441 self._assert_type_error(expect, context, a, b) 442 self._assert_type_error(expect, context, b, a) 443 444 expect = "all arguments must be bytes, not str ('hello')" 445 self._assert_type_error(expect, difflib.diff_bytes, unified, a, b) 446 self._assert_type_error(expect, difflib.diff_bytes, unified, b, a) 447 self._assert_type_error(expect, difflib.diff_bytes, context, a, b) 448 self._assert_type_error(expect, difflib.diff_bytes, context, b, a) 449 450 def test_mixed_types_filenames(self): 451 # cannot pass filenames as bytes if content is str (this may not be 452 # the right behaviour, but at least the test demonstrates how 453 # things work) 454 a = ['hello\n'] 455 b = ['ohell\n'] 456 fna = b'ol\xe9.txt' # filename transcoded from ISO-8859-1 457 fnb = b'ol\xc3a9.txt' # to UTF-8 458 self._assert_type_error( 459 "all arguments must be str, not: b'ol\\xe9.txt'", 460 difflib.unified_diff, a, b, fna, fnb) 461 462 def test_mixed_types_dates(self): 463 # type of dates must be consistent with type of contents 464 a = [b'foo\n'] 465 b = [b'bar\n'] 466 datea = '1 fév' 467 dateb = '3 fév' 468 self._assert_type_error( 469 "all arguments must be bytes, not str ('1 fév')", 470 difflib.diff_bytes, difflib.unified_diff, 471 a, b, b'a', b'b', datea, dateb) 472 473 # if input is str, non-ASCII dates are fine 474 a = ['foo\n'] 475 b = ['bar\n'] 476 list(difflib.unified_diff(a, b, 'a', 'b', datea, dateb)) 477 478 def _assert_type_error(self, msg, generator, *args): 479 with self.assertRaises(TypeError) as ctx: 480 list(generator(*args)) 481 self.assertEqual(msg, str(ctx.exception)) 482 483class TestJunkAPIs(unittest.TestCase): 484 def test_is_line_junk_true(self): 485 for line in ['#', ' ', ' #', '# ', ' # ', '']: 486 self.assertTrue(difflib.IS_LINE_JUNK(line), repr(line)) 487 488 def test_is_line_junk_false(self): 489 for line in ['##', ' ##', '## ', 'abc ', 'abc #', 'Mr. Moose is up!']: 490 self.assertFalse(difflib.IS_LINE_JUNK(line), repr(line)) 491 492 def test_is_line_junk_REDOS(self): 493 evil_input = ('\t' * 1000000) + '##' 494 self.assertFalse(difflib.IS_LINE_JUNK(evil_input)) 495 496 def test_is_character_junk_true(self): 497 for char in [' ', '\t']: 498 self.assertTrue(difflib.IS_CHARACTER_JUNK(char), repr(char)) 499 500 def test_is_character_junk_false(self): 501 for char in ['a', '#', '\n', '\f', '\r', '\v']: 502 self.assertFalse(difflib.IS_CHARACTER_JUNK(char), repr(char)) 503 504class TestFindLongest(unittest.TestCase): 505 def longer_match_exists(self, a, b, n): 506 return any(b_part in a for b_part in 507 [b[i:i + n + 1] for i in range(0, len(b) - n - 1)]) 508 509 def test_default_args(self): 510 a = 'foo bar' 511 b = 'foo baz bar' 512 sm = difflib.SequenceMatcher(a=a, b=b) 513 match = sm.find_longest_match() 514 self.assertEqual(match.a, 0) 515 self.assertEqual(match.b, 0) 516 self.assertEqual(match.size, 6) 517 self.assertEqual(a[match.a: match.a + match.size], 518 b[match.b: match.b + match.size]) 519 self.assertFalse(self.longer_match_exists(a, b, match.size)) 520 521 match = sm.find_longest_match(alo=2, blo=4) 522 self.assertEqual(match.a, 3) 523 self.assertEqual(match.b, 7) 524 self.assertEqual(match.size, 4) 525 self.assertEqual(a[match.a: match.a + match.size], 526 b[match.b: match.b + match.size]) 527 self.assertFalse(self.longer_match_exists(a[2:], b[4:], match.size)) 528 529 match = sm.find_longest_match(bhi=5, blo=1) 530 self.assertEqual(match.a, 1) 531 self.assertEqual(match.b, 1) 532 self.assertEqual(match.size, 4) 533 self.assertEqual(a[match.a: match.a + match.size], 534 b[match.b: match.b + match.size]) 535 self.assertFalse(self.longer_match_exists(a, b[1:5], match.size)) 536 537 def test_longest_match_with_popular_chars(self): 538 a = 'dabcd' 539 b = 'd'*100 + 'abc' + 'd'*100 # length over 200 so popular used 540 sm = difflib.SequenceMatcher(a=a, b=b) 541 match = sm.find_longest_match(0, len(a), 0, len(b)) 542 self.assertEqual(match.a, 0) 543 self.assertEqual(match.b, 99) 544 self.assertEqual(match.size, 5) 545 self.assertEqual(a[match.a: match.a + match.size], 546 b[match.b: match.b + match.size]) 547 self.assertFalse(self.longer_match_exists(a, b, match.size)) 548 549 550def test_main(): 551 difflib.HtmlDiff._default_prefix = 0 552 Doctests = doctest.DocTestSuite(difflib) 553 run_unittest( 554 TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs, 555 TestOutputFormat, TestBytes, TestJunkAPIs, TestFindLongest, Doctests) 556 557if __name__ == '__main__': 558 test_main() 559