1""" 2Module difflib -- helpers for computing deltas between objects. 3 4Function get_close_matches(word, possibilities, n=3, cutoff=0.6): 5 Use SequenceMatcher to return list of the best "good enough" matches. 6 7Function context_diff(a, b): 8 For two lists of strings, return a delta in context diff format. 9 10Function ndiff(a, b): 11 Return a delta: the difference between `a` and `b` (lists of strings). 12 13Function restore(delta, which): 14 Return one of the two sequences that generated an ndiff delta. 15 16Function unified_diff(a, b): 17 For two lists of strings, return a delta in unified diff format. 18 19Class SequenceMatcher: 20 A flexible class for comparing pairs of sequences of any type. 21 22Class Differ: 23 For producing human-readable deltas from sequences of lines of text. 24 25Class HtmlDiff: 26 For producing HTML side by side comparison with change highlights. 27""" 28 29__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', 30 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', 31 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match'] 32 33from heapq import nlargest as _nlargest 34from collections import namedtuple as _namedtuple 35from types import GenericAlias 36 37Match = _namedtuple('Match', 'a b size') 38 39def _calculate_ratio(matches, length): 40 if length: 41 return 2.0 * matches / length 42 return 1.0 43 44class SequenceMatcher: 45 46 """ 47 SequenceMatcher is a flexible class for comparing pairs of sequences of 48 any type, so long as the sequence elements are hashable. The basic 49 algorithm predates, and is a little fancier than, an algorithm 50 published in the late 1980's by Ratcliff and Obershelp under the 51 hyperbolic name "gestalt pattern matching". The basic idea is to find 52 the longest contiguous matching subsequence that contains no "junk" 53 elements (R-O doesn't address junk). 
The same idea is then applied 54 recursively to the pieces of the sequences to the left and to the right 55 of the matching subsequence. This does not yield minimal edit 56 sequences, but does tend to yield matches that "look right" to people. 57 58 SequenceMatcher tries to compute a "human-friendly diff" between two 59 sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the 60 longest *contiguous* & junk-free matching subsequence. That's what 61 catches peoples' eyes. The Windows(tm) windiff has another interesting 62 notion, pairing up elements that appear uniquely in each sequence. 63 That, and the method here, appear to yield more intuitive difference 64 reports than does diff. This method appears to be the least vulnerable 65 to syncing up on blocks of "junk lines", though (like blank lines in 66 ordinary text files, or maybe "<P>" lines in HTML files). That may be 67 because this is the only method of the 3 that has a *concept* of 68 "junk" <wink>. 69 70 Example, comparing two strings, and considering blanks to be "junk": 71 72 >>> s = SequenceMatcher(lambda x: x == " ", 73 ... "private Thread currentThread;", 74 ... "private volatile Thread currentThread;") 75 >>> 76 77 .ratio() returns a float in [0, 1], measuring the "similarity" of the 78 sequences. As a rule of thumb, a .ratio() value over 0.6 means the 79 sequences are close matches: 80 81 >>> print(round(s.ratio(), 3)) 82 0.866 83 >>> 84 85 If you're only interested in where the sequences match, 86 .get_matching_blocks() is handy: 87 88 >>> for block in s.get_matching_blocks(): 89 ... print("a[%d] and b[%d] match for %d elements" % block) 90 a[0] and b[0] match for 8 elements 91 a[8] and b[17] match for 21 elements 92 a[29] and b[38] match for 0 elements 93 94 Note that the last tuple returned by .get_matching_blocks() is always a 95 dummy, (len(a), len(b), 0), and this is the only case in which the last 96 tuple element (number of elements matched) is 0. 
97 98 If you want to know how to change the first sequence into the second, 99 use .get_opcodes(): 100 101 >>> for opcode in s.get_opcodes(): 102 ... print("%6s a[%d:%d] b[%d:%d]" % opcode) 103 equal a[0:8] b[0:8] 104 insert a[8:8] b[8:17] 105 equal a[8:29] b[17:38] 106 107 See the Differ class for a fancy human-friendly file differencer, which 108 uses SequenceMatcher both to compare sequences of lines, and to compare 109 sequences of characters within similar (near-matching) lines. 110 111 See also function get_close_matches() in this module, which shows how 112 simple code building on SequenceMatcher can be used to do useful work. 113 114 Timing: Basic R-O is cubic time worst case and quadratic time expected 115 case. SequenceMatcher is quadratic time for the worst case and has 116 expected-case behavior dependent in a complicated way on how many 117 elements the sequences have in common; best case time is linear. 118 """ 119 120 def __init__(self, isjunk=None, a='', b='', autojunk=True): 121 """Construct a SequenceMatcher. 122 123 Optional arg isjunk is None (the default), or a one-argument 124 function that takes a sequence element and returns true iff the 125 element is junk. None is equivalent to passing "lambda x: 0", i.e. 126 no elements are considered to be junk. For example, pass 127 lambda x: x in " \\t" 128 if you're comparing lines as sequences of characters, and don't 129 want to synch up on blanks or hard tabs. 130 131 Optional arg a is the first of two sequences to be compared. By 132 default, an empty string. The elements of a must be hashable. See 133 also .set_seqs() and .set_seq1(). 134 135 Optional arg b is the second of two sequences to be compared. By 136 default, an empty string. The elements of b must be hashable. See 137 also .set_seqs() and .set_seq2(). 
138 139 Optional arg autojunk should be set to False to disable the 140 "automatic junk heuristic" that treats popular elements as junk 141 (see module documentation for more information). 142 """ 143 144 # Members: 145 # a 146 # first sequence 147 # b 148 # second sequence; differences are computed as "what do 149 # we need to do to 'a' to change it into 'b'?" 150 # b2j 151 # for x in b, b2j[x] is a list of the indices (into b) 152 # at which x appears; junk and popular elements do not appear 153 # fullbcount 154 # for x in b, fullbcount[x] == the number of times x 155 # appears in b; only materialized if really needed (used 156 # only for computing quick_ratio()) 157 # matching_blocks 158 # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; 159 # ascending & non-overlapping in i and in j; terminated by 160 # a dummy (len(a), len(b), 0) sentinel 161 # opcodes 162 # a list of (tag, i1, i2, j1, j2) tuples, where tag is 163 # one of 164 # 'replace' a[i1:i2] should be replaced by b[j1:j2] 165 # 'delete' a[i1:i2] should be deleted 166 # 'insert' b[j1:j2] should be inserted 167 # 'equal' a[i1:i2] == b[j1:j2] 168 # isjunk 169 # a user-supplied function taking a sequence element and 170 # returning true iff the element is "junk" -- this has 171 # subtle but helpful effects on the algorithm, which I'll 172 # get around to writing up someday <0.9 wink>. 173 # DON'T USE! Only __chain_b uses this. Use "in self.bjunk". 174 # bjunk 175 # the items in b for which isjunk is True. 176 # bpopular 177 # nonjunk items in b treated as junk by the heuristic (if used). 178 179 self.isjunk = isjunk 180 self.a = self.b = None 181 self.autojunk = autojunk 182 self.set_seqs(a, b) 183 184 def set_seqs(self, a, b): 185 """Set the two sequences to be compared. 186 187 >>> s = SequenceMatcher() 188 >>> s.set_seqs("abcd", "bcde") 189 >>> s.ratio() 190 0.75 191 """ 192 193 self.set_seq1(a) 194 self.set_seq2(b) 195 196 def set_seq1(self, a): 197 """Set the first sequence to be compared. 
198 199 The second sequence to be compared is not changed. 200 201 >>> s = SequenceMatcher(None, "abcd", "bcde") 202 >>> s.ratio() 203 0.75 204 >>> s.set_seq1("bcde") 205 >>> s.ratio() 206 1.0 207 >>> 208 209 SequenceMatcher computes and caches detailed information about the 210 second sequence, so if you want to compare one sequence S against 211 many sequences, use .set_seq2(S) once and call .set_seq1(x) 212 repeatedly for each of the other sequences. 213 214 See also set_seqs() and set_seq2(). 215 """ 216 217 if a is self.a: 218 return 219 self.a = a 220 self.matching_blocks = self.opcodes = None 221 222 def set_seq2(self, b): 223 """Set the second sequence to be compared. 224 225 The first sequence to be compared is not changed. 226 227 >>> s = SequenceMatcher(None, "abcd", "bcde") 228 >>> s.ratio() 229 0.75 230 >>> s.set_seq2("abcd") 231 >>> s.ratio() 232 1.0 233 >>> 234 235 SequenceMatcher computes and caches detailed information about the 236 second sequence, so if you want to compare one sequence S against 237 many sequences, use .set_seq2(S) once and call .set_seq1(x) 238 repeatedly for each of the other sequences. 239 240 See also set_seqs() and set_seq1(). 241 """ 242 243 if b is self.b: 244 return 245 self.b = b 246 self.matching_blocks = self.opcodes = None 247 self.fullbcount = None 248 self.__chain_b() 249 250 # For each element x in b, set b2j[x] to a list of the indices in 251 # b where x appears; the indices are in increasing order; note that 252 # the number of times x appears in b is len(b2j[x]) ... 253 # when self.isjunk is defined, junk elements don't show up in this 254 # map at all, which stops the central find_longest_match method 255 # from starting any matching block at a junk element ... 
256 # b2j also does not contain entries for "popular" elements, meaning 257 # elements that account for more than 1 + 1% of the total elements, and 258 # when the sequence is reasonably large (>= 200 elements); this can 259 # be viewed as an adaptive notion of semi-junk, and yields an enormous 260 # speedup when, e.g., comparing program files with hundreds of 261 # instances of "return NULL;" ... 262 # note that this is only called when b changes; so for cross-product 263 # kinds of matches, it's best to call set_seq2 once, then set_seq1 264 # repeatedly 265 266 def __chain_b(self): 267 # Because isjunk is a user-defined (not C) function, and we test 268 # for junk a LOT, it's important to minimize the number of calls. 269 # Before the tricks described here, __chain_b was by far the most 270 # time-consuming routine in the whole module! If anyone sees 271 # Jim Roskind, thank him again for profile.py -- I never would 272 # have guessed that. 273 # The first trick is to build b2j ignoring the possibility 274 # of junk. I.e., we don't call isjunk at all yet. Throwing 275 # out the junk later is much cheaper than building b2j "right" 276 # from the start. 
277 b = self.b 278 self.b2j = b2j = {} 279 280 for i, elt in enumerate(b): 281 indices = b2j.setdefault(elt, []) 282 indices.append(i) 283 284 # Purge junk elements 285 self.bjunk = junk = set() 286 isjunk = self.isjunk 287 if isjunk: 288 for elt in b2j.keys(): 289 if isjunk(elt): 290 junk.add(elt) 291 for elt in junk: # separate loop avoids separate list of keys 292 del b2j[elt] 293 294 # Purge popular elements that are not junk 295 self.bpopular = popular = set() 296 n = len(b) 297 if self.autojunk and n >= 200: 298 ntest = n // 100 + 1 299 for elt, idxs in b2j.items(): 300 if len(idxs) > ntest: 301 popular.add(elt) 302 for elt in popular: # ditto; as fast for 1% deletion 303 del b2j[elt] 304 305 def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None): 306 """Find longest matching block in a[alo:ahi] and b[blo:bhi]. 307 308 By default it will find the longest match in the entirety of a and b. 309 310 If isjunk is not defined: 311 312 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where 313 alo <= i <= i+k <= ahi 314 blo <= j <= j+k <= bhi 315 and for all (i',j',k') meeting those conditions, 316 k >= k' 317 i <= i' 318 and if i == i', j <= j' 319 320 In other words, of all maximal matching blocks, return one that 321 starts earliest in a, and of all those maximal matching blocks that 322 start earliest in a, return the one that starts earliest in b. 323 324 >>> s = SequenceMatcher(None, " abcd", "abcd abcd") 325 >>> s.find_longest_match(0, 5, 0, 9) 326 Match(a=0, b=4, size=5) 327 328 If isjunk is defined, first the longest matching block is 329 determined as above, but with the additional restriction that no 330 junk element appears in the block. Then that block is extended as 331 far as possible by matching (only) junk elements on both sides. So 332 the resulting block never matches on junk except as identical junk 333 happens to be adjacent to an "interesting" match. 
334 335 Here's the same example as before, but considering blanks to be 336 junk. That prevents " abcd" from matching the " abcd" at the tail 337 end of the second sequence directly. Instead only the "abcd" can 338 match, and matches the leftmost "abcd" in the second sequence: 339 340 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd") 341 >>> s.find_longest_match(0, 5, 0, 9) 342 Match(a=1, b=0, size=4) 343 344 If no blocks match, return (alo, blo, 0). 345 346 >>> s = SequenceMatcher(None, "ab", "c") 347 >>> s.find_longest_match(0, 2, 0, 1) 348 Match(a=0, b=0, size=0) 349 """ 350 351 # CAUTION: stripping common prefix or suffix would be incorrect. 352 # E.g., 353 # ab 354 # acab 355 # Longest matching block is "ab", but if common prefix is 356 # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so 357 # strip, so ends up claiming that ab is changed to acab by 358 # inserting "ca" in the middle. That's minimal but unintuitive: 359 # "it's obvious" that someone inserted "ac" at the front. 360 # Windiff ends up at the same place as diff, but by pairing up 361 # the unique 'b's and then matching the first two 'a's. 
362 363 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__ 364 if ahi is None: 365 ahi = len(a) 366 if bhi is None: 367 bhi = len(b) 368 besti, bestj, bestsize = alo, blo, 0 369 # find longest junk-free match 370 # during an iteration of the loop, j2len[j] = length of longest 371 # junk-free match ending with a[i-1] and b[j] 372 j2len = {} 373 nothing = [] 374 for i in range(alo, ahi): 375 # look at all instances of a[i] in b; note that because 376 # b2j has no junk keys, the loop is skipped if a[i] is junk 377 j2lenget = j2len.get 378 newj2len = {} 379 for j in b2j.get(a[i], nothing): 380 # a[i] matches b[j] 381 if j < blo: 382 continue 383 if j >= bhi: 384 break 385 k = newj2len[j] = j2lenget(j-1, 0) + 1 386 if k > bestsize: 387 besti, bestj, bestsize = i-k+1, j-k+1, k 388 j2len = newj2len 389 390 # Extend the best by non-junk elements on each end. In particular, 391 # "popular" non-junk elements aren't in b2j, which greatly speeds 392 # the inner loop above, but also means "the best" match so far 393 # doesn't contain any junk *or* popular non-junk elements. 394 while besti > alo and bestj > blo and \ 395 not isbjunk(b[bestj-1]) and \ 396 a[besti-1] == b[bestj-1]: 397 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 398 while besti+bestsize < ahi and bestj+bestsize < bhi and \ 399 not isbjunk(b[bestj+bestsize]) and \ 400 a[besti+bestsize] == b[bestj+bestsize]: 401 bestsize += 1 402 403 # Now that we have a wholly interesting match (albeit possibly 404 # empty!), we may as well suck up the matching junk on each 405 # side of it too. Can't think of a good reason not to, and it 406 # saves post-processing the (possibly considerable) expense of 407 # figuring out what to do with it. In the case of an empty 408 # interesting match, this is clearly the right thing to do, 409 # because no other kind of match is possible in the regions. 
410 while besti > alo and bestj > blo and \ 411 isbjunk(b[bestj-1]) and \ 412 a[besti-1] == b[bestj-1]: 413 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 414 while besti+bestsize < ahi and bestj+bestsize < bhi and \ 415 isbjunk(b[bestj+bestsize]) and \ 416 a[besti+bestsize] == b[bestj+bestsize]: 417 bestsize = bestsize + 1 418 419 return Match(besti, bestj, bestsize) 420 421 def get_matching_blocks(self): 422 """Return list of triples describing matching subsequences. 423 424 Each triple is of the form (i, j, n), and means that 425 a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in 426 i and in j. New in Python 2.5, it's also guaranteed that if 427 (i, j, n) and (i', j', n') are adjacent triples in the list, and 428 the second is not the last triple in the list, then i+n != i' or 429 j+n != j'. IOW, adjacent triples never describe adjacent equal 430 blocks. 431 432 The last triple is a dummy, (len(a), len(b), 0), and is the only 433 triple with n==0. 434 435 >>> s = SequenceMatcher(None, "abxcd", "abcd") 436 >>> list(s.get_matching_blocks()) 437 [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)] 438 """ 439 440 if self.matching_blocks is not None: 441 return self.matching_blocks 442 la, lb = len(self.a), len(self.b) 443 444 # This is most naturally expressed as a recursive algorithm, but 445 # at least one user bumped into extreme use cases that exceeded 446 # the recursion limit on their box. So, now we maintain a list 447 # ('queue`) of blocks we still need to look at, and append partial 448 # results to `matching_blocks` in a loop; the matches are sorted 449 # at the end. 
450 queue = [(0, la, 0, lb)] 451 matching_blocks = [] 452 while queue: 453 alo, ahi, blo, bhi = queue.pop() 454 i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) 455 # a[alo:i] vs b[blo:j] unknown 456 # a[i:i+k] same as b[j:j+k] 457 # a[i+k:ahi] vs b[j+k:bhi] unknown 458 if k: # if k is 0, there was no matching block 459 matching_blocks.append(x) 460 if alo < i and blo < j: 461 queue.append((alo, i, blo, j)) 462 if i+k < ahi and j+k < bhi: 463 queue.append((i+k, ahi, j+k, bhi)) 464 matching_blocks.sort() 465 466 # It's possible that we have adjacent equal blocks in the 467 # matching_blocks list now. Starting with 2.5, this code was added 468 # to collapse them. 469 i1 = j1 = k1 = 0 470 non_adjacent = [] 471 for i2, j2, k2 in matching_blocks: 472 # Is this block adjacent to i1, j1, k1? 473 if i1 + k1 == i2 and j1 + k1 == j2: 474 # Yes, so collapse them -- this just increases the length of 475 # the first block by the length of the second, and the first 476 # block so lengthened remains the block to compare against. 477 k1 += k2 478 else: 479 # Not adjacent. Remember the first block (k1==0 means it's 480 # the dummy we started with), and make the second block the 481 # new block to compare against. 482 if k1: 483 non_adjacent.append((i1, j1, k1)) 484 i1, j1, k1 = i2, j2, k2 485 if k1: 486 non_adjacent.append((i1, j1, k1)) 487 488 non_adjacent.append( (la, lb, 0) ) 489 self.matching_blocks = list(map(Match._make, non_adjacent)) 490 return self.matching_blocks 491 492 def get_opcodes(self): 493 """Return list of 5-tuples describing how to turn a into b. 494 495 Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple 496 has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the 497 tuple preceding it, and likewise for j1 == the previous j2. 498 499 The tags are strings, with these meanings: 500 501 'replace': a[i1:i2] should be replaced by b[j1:j2] 502 'delete': a[i1:i2] should be deleted. 503 Note that j1==j2 in this case. 
504 'insert': b[j1:j2] should be inserted at a[i1:i1]. 505 Note that i1==i2 in this case. 506 'equal': a[i1:i2] == b[j1:j2] 507 508 >>> a = "qabxcd" 509 >>> b = "abycdf" 510 >>> s = SequenceMatcher(None, a, b) 511 >>> for tag, i1, i2, j1, j2 in s.get_opcodes(): 512 ... print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % 513 ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))) 514 delete a[0:1] (q) b[0:0] () 515 equal a[1:3] (ab) b[0:2] (ab) 516 replace a[3:4] (x) b[2:3] (y) 517 equal a[4:6] (cd) b[3:5] (cd) 518 insert a[6:6] () b[5:6] (f) 519 """ 520 521 if self.opcodes is not None: 522 return self.opcodes 523 i = j = 0 524 self.opcodes = answer = [] 525 for ai, bj, size in self.get_matching_blocks(): 526 # invariant: we've pumped out correct diffs to change 527 # a[:i] into b[:j], and the next matching block is 528 # a[ai:ai+size] == b[bj:bj+size]. So we need to pump 529 # out a diff to change a[i:ai] into b[j:bj], pump out 530 # the matching block, and move (i,j) beyond the match 531 tag = '' 532 if i < ai and j < bj: 533 tag = 'replace' 534 elif i < ai: 535 tag = 'delete' 536 elif j < bj: 537 tag = 'insert' 538 if tag: 539 answer.append( (tag, i, ai, j, bj) ) 540 i, j = ai+size, bj+size 541 # the list of matching blocks is terminated by a 542 # sentinel with size 0 543 if size: 544 answer.append( ('equal', ai, i, bj, j) ) 545 return answer 546 547 def get_grouped_opcodes(self, n=3): 548 """ Isolate change clusters by eliminating ranges with no changes. 549 550 Return a generator of groups with up to n lines of context. 551 Each group is in the same format as returned by get_opcodes(). 
552 553 >>> from pprint import pprint 554 >>> a = list(map(str, range(1,40))) 555 >>> b = a[:] 556 >>> b[8:8] = ['i'] # Make an insertion 557 >>> b[20] += 'x' # Make a replacement 558 >>> b[23:28] = [] # Make a deletion 559 >>> b[30] += 'y' # Make another replacement 560 >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes())) 561 [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)], 562 [('equal', 16, 19, 17, 20), 563 ('replace', 19, 20, 20, 21), 564 ('equal', 20, 22, 21, 23), 565 ('delete', 22, 27, 23, 23), 566 ('equal', 27, 30, 23, 26)], 567 [('equal', 31, 34, 27, 30), 568 ('replace', 34, 35, 30, 31), 569 ('equal', 35, 38, 31, 34)]] 570 """ 571 572 codes = self.get_opcodes() 573 if not codes: 574 codes = [("equal", 0, 1, 0, 1)] 575 # Fixup leading and trailing groups if they show no changes. 576 if codes[0][0] == 'equal': 577 tag, i1, i2, j1, j2 = codes[0] 578 codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2 579 if codes[-1][0] == 'equal': 580 tag, i1, i2, j1, j2 = codes[-1] 581 codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) 582 583 nn = n + n 584 group = [] 585 for tag, i1, i2, j1, j2 in codes: 586 # End the current group and start a new one whenever 587 # there is a large range with no changes. 588 if tag == 'equal' and i2-i1 > nn: 589 group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n))) 590 yield group 591 group = [] 592 i1, j1 = max(i1, i2-n), max(j1, j2-n) 593 group.append((tag, i1, i2, j1 ,j2)) 594 if group and not (len(group)==1 and group[0][0] == 'equal'): 595 yield group 596 597 def ratio(self): 598 """Return a measure of the sequences' similarity (float in [0,1]). 599 600 Where T is the total number of elements in both sequences, and 601 M is the number of matches, this is 2.0*M / T. 602 Note that this is 1 if the sequences are identical, and 0 if 603 they have nothing in common. 
604 605 .ratio() is expensive to compute if you haven't already computed 606 .get_matching_blocks() or .get_opcodes(), in which case you may 607 want to try .quick_ratio() or .real_quick_ratio() first to get an 608 upper bound. 609 610 >>> s = SequenceMatcher(None, "abcd", "bcde") 611 >>> s.ratio() 612 0.75 613 >>> s.quick_ratio() 614 0.75 615 >>> s.real_quick_ratio() 616 1.0 617 """ 618 619 matches = sum(triple[-1] for triple in self.get_matching_blocks()) 620 return _calculate_ratio(matches, len(self.a) + len(self.b)) 621 622 def quick_ratio(self): 623 """Return an upper bound on ratio() relatively quickly. 624 625 This isn't defined beyond that it is an upper bound on .ratio(), and 626 is faster to compute. 627 """ 628 629 # viewing a and b as multisets, set matches to the cardinality 630 # of their intersection; this counts the number of matches 631 # without regard to order, so is clearly an upper bound 632 if self.fullbcount is None: 633 self.fullbcount = fullbcount = {} 634 for elt in self.b: 635 fullbcount[elt] = fullbcount.get(elt, 0) + 1 636 fullbcount = self.fullbcount 637 # avail[x] is the number of times x appears in 'b' less the 638 # number of times we've seen it in 'a' so far ... kinda 639 avail = {} 640 availhas, matches = avail.__contains__, 0 641 for elt in self.a: 642 if availhas(elt): 643 numb = avail[elt] 644 else: 645 numb = fullbcount.get(elt, 0) 646 avail[elt] = numb - 1 647 if numb > 0: 648 matches = matches + 1 649 return _calculate_ratio(matches, len(self.a) + len(self.b)) 650 651 def real_quick_ratio(self): 652 """Return an upper bound on ratio() very quickly. 653 654 This isn't defined beyond that it is an upper bound on .ratio(), and 655 is faster to compute than either .ratio() or .quick_ratio(). 
656 """ 657 658 la, lb = len(self.a), len(self.b) 659 # can't have more matches than the number of elements in the 660 # shorter sequence 661 return _calculate_ratio(min(la, lb), la + lb) 662 663 __class_getitem__ = classmethod(GenericAlias) 664 665 666def get_close_matches(word, possibilities, n=3, cutoff=0.6): 667 """Use SequenceMatcher to return list of the best "good enough" matches. 668 669 word is a sequence for which close matches are desired (typically a 670 string). 671 672 possibilities is a list of sequences against which to match word 673 (typically a list of strings). 674 675 Optional arg n (default 3) is the maximum number of close matches to 676 return. n must be > 0. 677 678 Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities 679 that don't score at least that similar to word are ignored. 680 681 The best (no more than n) matches among the possibilities are returned 682 in a list, sorted by similarity score, most similar first. 683 684 >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"]) 685 ['apple', 'ape'] 686 >>> import keyword as _keyword 687 >>> get_close_matches("wheel", _keyword.kwlist) 688 ['while'] 689 >>> get_close_matches("Apple", _keyword.kwlist) 690 [] 691 >>> get_close_matches("accept", _keyword.kwlist) 692 ['except'] 693 """ 694 695 if not n > 0: 696 raise ValueError("n must be > 0: %r" % (n,)) 697 if not 0.0 <= cutoff <= 1.0: 698 raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) 699 result = [] 700 s = SequenceMatcher() 701 s.set_seq2(word) 702 for x in possibilities: 703 s.set_seq1(x) 704 if s.real_quick_ratio() >= cutoff and \ 705 s.quick_ratio() >= cutoff and \ 706 s.ratio() >= cutoff: 707 result.append((s.ratio(), x)) 708 709 # Move the best scorers to head of list 710 result = _nlargest(n, result) 711 # Strip scores for the best n matches 712 return [x for score, x in result] 713 714 715def _keep_original_ws(s, tag_s): 716 """Replace whitespace with the original whitespace 
class Differ:
    r"""
    Differ is a class for comparing sequences of lines of text, and
    producing human-readable differences or deltas.  Differ uses
    SequenceMatcher both to compare sequences of lines, and to compare
    sequences of characters within similar (near-matching) lines.

    Each line of a Differ delta begins with a two-letter code:

        '- '    line unique to sequence 1
        '+ '    line unique to sequence 2
        '  '    line common to both sequences
        '? '    line not present in either input sequence

    Lines beginning with '? ' attempt to guide the eye to intraline
    differences, and were not present in either input sequence.  These lines
    can be confusing if the sequences contain tab characters.

    Note that Differ makes no claim to produce a *minimal* diff.  To the
    contrary, minimal diffs are often counter-intuitive, because they synch
    up anywhere possible, sometimes accidental matches 100 pages apart.
    Restricting synch points to contiguous matches preserves some notion of
    locality, at the occasional cost of producing a longer diff.

    Example: Comparing two texts.

    First we set up the texts, sequences of individual single-line strings
    ending with newlines (such sequences can also be obtained from the
    `readlines()` method of file-like objects):

    >>> text1 = '''  1. Beautiful is better than ugly.
    ...   2. Explicit is better than implicit.
    ...   3. Simple is better than complex.
    ...   4. Complex is better than complicated.
    ... '''.splitlines(keepends=True)
    >>> len(text1)
    4
    >>> text1[0][-1]
    '\n'
    >>> text2 = '''  1. Beautiful is better than ugly.
    ...   3.   Simple is better than complex.
    ...   4. Complicated is better than complex.
    ...   5. Flat is better than nested.
    ... '''.splitlines(keepends=True)

    Next we instantiate a Differ object:

    >>> d = Differ()

    Note that when instantiating a Differ object we may pass functions to
    filter out line and character 'junk'.  See Differ.__init__ for details.

    Finally, we compare the two:

    >>> result = list(d.compare(text1, text2))

    'result' is a list of strings, so let's pretty-print it:

    >>> from pprint import pprint as _pprint
    >>> _pprint(result)
    ['    1. Beautiful is better than ugly.\n',
     '-   2. Explicit is better than implicit.\n',
     '-   3. Simple is better than complex.\n',
     '+   3.   Simple is better than complex.\n',
     '?     ++\n',
     '-   4. Complex is better than complicated.\n',
     '?            ^                     ---- ^\n',
     '+   4. Complicated is better than complex.\n',
     '?           ++++ ^                      ^\n',
     '+   5. Flat is better than nested.\n']

    As a single multi-line string it looks like this:

    >>> print(''.join(result), end="")
        1. Beautiful is better than ugly.
    -   2. Explicit is better than implicit.
    -   3. Simple is better than complex.
    +   3.   Simple is better than complex.
    ?     ++
    -   4. Complex is better than complicated.
    ?            ^                     ---- ^
    +   4. Complicated is better than complex.
    ?           ++++ ^                      ^
    +   5. Flat is better than nested.
    """

    def __init__(self, linejunk=None, charjunk=None):
        """
        Construct a text differencer, with optional filters.

        The two optional keyword parameters are for filter functions:

        - `linejunk`: A function that should accept a single string argument,
          and return true iff the string is junk. The module-level function
          `IS_LINE_JUNK` may be used to filter out lines without visible
          characters, except for at most one splat ('#').  It is recommended
          to leave linejunk None; the underlying SequenceMatcher class has
          an adaptive notion of "noise" lines that's better than any static
          definition the author has ever been able to craft.

        - `charjunk`: A function that should accept a string of length 1. The
          module-level function `IS_CHARACTER_JUNK` may be used to filter out
          whitespace characters (a blank or tab; **note**: bad idea to include
          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
        """

        self.linejunk = linejunk
        self.charjunk = charjunk

    def compare(self, a, b):
        r"""
        Compare two sequences of lines; generate the resulting delta.

        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
        of file-like objects.  The delta generated also consists of newline-
        terminated strings, ready to be printed as-is via the writeline()
        method of a file-like object.

        Example:

        >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
        ...                                'ore\ntree\nemu\n'.splitlines(True))),
        ...       end="")
        - one
        ?  ^
        + ore
        ?  ^
        - two
        - three
        ?  -
        + tree
        + emu
        """

        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError('unknown tag %r' % (tag,))

            yield from g

    def _dump(self, tag, x, lo, hi):
        """Generate comparison results for a same-tagged range."""
        for i in range(lo, hi):
            yield '%s %s' % (tag, x[i])

    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
            first  = self._dump('+', b, blo, bhi)
            second = self._dump('-', a, alo, ahi)
        else:
            first  = self._dump('-', a, alo, ahi)
            second = self._dump('+', b, blo, bhi)

        for g in first, second:
            yield from g

    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
        When replacing one block of lines with another, search the blocks
        for *similar* lines; the best-matching pair (if any) is used as a
        synch point, and intraline difference marking is done on the
        similar pair. Lots of work, but often worth it.

        Example:

        >>> d = Differ()
        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
        ...                            ['abcdefGhijkl\n'], 0, 1)
        >>> print(''.join(results), end="")
        - abcDefghiJkl
        ?    ^  ^  ^
        + abcdefGhijkl
        ?    ^  ^  ^
        """

        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
        cruncher = SequenceMatcher(self.charjunk)
        eqi, eqj = None, None   # 1st indices of equal lines (if any)

        # search for the pair that matches best without being identical
        # (identical lines must be junk lines, & we don't want to synch up
        # on junk -- unless we have to)
        for j in range(blo, bhi):
            bj = b[j]
            cruncher.set_seq2(bj)
            for i in range(alo, ahi):
                ai = a[i]
                if ai == bj:
                    if eqi is None:
                        eqi, eqj = i, j
                    continue
                cruncher.set_seq1(ai)
                # computing similarity is expensive, so use the quick
                # upper bounds first -- have seen this speed up messy
                # compares by a factor of 3.
                # note that ratio() is only expensive to compute the first
                # time it's called on a sequence pair; the expensive part
                # of the computation is cached by cruncher
                if cruncher.real_quick_ratio() > best_ratio and \
                      cruncher.quick_ratio() > best_ratio and \
                      cruncher.ratio() > best_ratio:
                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
        if best_ratio < cutoff:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
                yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
        else:
            # there's a close pair, so forget the identical pair (if any)
            eqi = None

        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical

        # pump out diffs from before the synch point
        yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)

        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
        if eqi is None:
            # pump out a '-', '?', '+', '?' quad for the synched lines
            atags = btags = ""
            cruncher.set_seqs(aelt, belt)
            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
                la, lb = ai2 - ai1, bj2 - bj1
                if tag == 'replace':
                    atags += '^' * la
                    btags += '^' * lb
                elif tag == 'delete':
                    atags += '-' * la
                elif tag == 'insert':
                    btags += '+' * lb
                elif tag == 'equal':
                    atags += ' ' * la
                    btags += ' ' * lb
                else:
                    raise ValueError('unknown tag %r' % (tag,))
            yield from self._qformat(aelt, belt, atags, btags)
        else:
            # the synch pair is identical
            yield '  ' + aelt

        # pump out diffs from after the synch point
        yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        g = []
        if alo < ahi:
            if blo < bhi:
                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
            g = self._dump('+', b, blo, bhi)

        yield from g

    def _qformat(self, aline, bline, atags, btags):
        r"""
        Format "?" output and deal with tabs.

        Example:

        >>> d = Differ()
        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
        >>> for line in results: print(repr(line))
        ...
        '- \tabcDefghiJkl\n'
        '? \t ^ ^  ^\n'
        '+ \tabcdefGhijkl\n'
        '? \t ^ ^  ^\n'
        """
        atags = _keep_original_ws(aline, atags).rstrip()
        btags = _keep_original_ws(bline, btags).rstrip()

        yield "- " + aline
        if atags:
            yield f"? {atags}\n"

        yield "+ " + bline
        if btags:
            yield f"? {btags}\n"

# With respect to junk, an earlier version of ndiff simply refused to
# *start* a match with a junk element.
The result was cases like this: 1028# before: private Thread currentThread; 1029# after: private volatile Thread currentThread; 1030# If you consider whitespace to be junk, the longest contiguous match 1031# not starting with junk is "e Thread currentThread". So ndiff reported 1032# that "e volatil" was inserted between the 't' and the 'e' in "private". 1033# While an accurate view, to people that's absurd. The current version 1034# looks for matching blocks that are entirely junk-free, then extends the 1035# longest one of those as far as possible but only with matching junk. 1036# So now "currentThread" is matched, then extended to suck up the 1037# preceding blank; then "private" is matched, and extended to suck up the 1038# following blank; then "Thread" is matched; and finally ndiff reports 1039# that "volatile " was inserted before "Thread". The only quibble 1040# remaining is that perhaps it was really the case that " volatile" 1041# was inserted after "private". I can live with that <wink>. 1042 1043import re 1044 1045def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match): 1046 r""" 1047 Return True for ignorable line: iff `line` is blank or contains a single '#'. 1048 1049 Examples: 1050 1051 >>> IS_LINE_JUNK('\n') 1052 True 1053 >>> IS_LINE_JUNK(' # \n') 1054 True 1055 >>> IS_LINE_JUNK('hello\n') 1056 False 1057 """ 1058 1059 return pat(line) is not None 1060 1061def IS_CHARACTER_JUNK(ch, ws=" \t"): 1062 r""" 1063 Return True for ignorable character: iff `ch` is a space or tab. 
1064 1065 Examples: 1066 1067 >>> IS_CHARACTER_JUNK(' ') 1068 True 1069 >>> IS_CHARACTER_JUNK('\t') 1070 True 1071 >>> IS_CHARACTER_JUNK('\n') 1072 False 1073 >>> IS_CHARACTER_JUNK('x') 1074 False 1075 """ 1076 1077 return ch in ws 1078 1079 1080######################################################################## 1081### Unified Diff 1082######################################################################## 1083 1084def _format_range_unified(start, stop): 1085 'Convert range to the "ed" format' 1086 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1087 beginning = start + 1 # lines start numbering with one 1088 length = stop - start 1089 if length == 1: 1090 return '{}'.format(beginning) 1091 if not length: 1092 beginning -= 1 # empty ranges begin at line just before the range 1093 return '{},{}'.format(beginning, length) 1094 1095def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', 1096 tofiledate='', n=3, lineterm='\n'): 1097 r""" 1098 Compare two sequences of lines; generate the delta as a unified diff. 1099 1100 Unified diffs are a compact way of showing line changes and a few 1101 lines of context. The number of context lines is set by 'n' which 1102 defaults to three. 1103 1104 By default, the diff control lines (those with ---, +++, or @@) are 1105 created with a trailing newline. This is helpful so that inputs 1106 created from file.readlines() result in diffs that are suitable for 1107 file.writelines() since both the inputs and outputs have trailing 1108 newlines. 1109 1110 For inputs that do not have trailing newlines, set the lineterm 1111 argument to "" so that the output will be uniformly newline free. 1112 1113 The unidiff format normally has a header for filenames and modification 1114 times. Any or all of these may be specified using strings for 1115 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1116 The modification times are normally expressed in the ISO 8601 format. 
1117 1118 Example: 1119 1120 >>> for line in unified_diff('one two three four'.split(), 1121 ... 'zero one tree four'.split(), 'Original', 'Current', 1122 ... '2005-01-26 23:30:50', '2010-04-02 10:20:52', 1123 ... lineterm=''): 1124 ... print(line) # doctest: +NORMALIZE_WHITESPACE 1125 --- Original 2005-01-26 23:30:50 1126 +++ Current 2010-04-02 10:20:52 1127 @@ -1,4 +1,4 @@ 1128 +zero 1129 one 1130 -two 1131 -three 1132 +tree 1133 four 1134 """ 1135 1136 _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) 1137 started = False 1138 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1139 if not started: 1140 started = True 1141 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1142 todate = '\t{}'.format(tofiledate) if tofiledate else '' 1143 yield '--- {}{}{}'.format(fromfile, fromdate, lineterm) 1144 yield '+++ {}{}{}'.format(tofile, todate, lineterm) 1145 1146 first, last = group[0], group[-1] 1147 file1_range = _format_range_unified(first[1], last[2]) 1148 file2_range = _format_range_unified(first[3], last[4]) 1149 yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm) 1150 1151 for tag, i1, i2, j1, j2 in group: 1152 if tag == 'equal': 1153 for line in a[i1:i2]: 1154 yield ' ' + line 1155 continue 1156 if tag in {'replace', 'delete'}: 1157 for line in a[i1:i2]: 1158 yield '-' + line 1159 if tag in {'replace', 'insert'}: 1160 for line in b[j1:j2]: 1161 yield '+' + line 1162 1163 1164######################################################################## 1165### Context Diff 1166######################################################################## 1167 1168def _format_range_context(start, stop): 1169 'Convert range to the "ed" format' 1170 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1171 beginning = start + 1 # lines start numbering with one 1172 length = stop - start 1173 if not length: 1174 beginning -= 1 # empty ranges begin at line just before the range 1175 if length <= 1: 
1176 return '{}'.format(beginning) 1177 return '{},{}'.format(beginning, beginning + length - 1) 1178 1179# See http://www.unix.org/single_unix_specification/ 1180def context_diff(a, b, fromfile='', tofile='', 1181 fromfiledate='', tofiledate='', n=3, lineterm='\n'): 1182 r""" 1183 Compare two sequences of lines; generate the delta as a context diff. 1184 1185 Context diffs are a compact way of showing line changes and a few 1186 lines of context. The number of context lines is set by 'n' which 1187 defaults to three. 1188 1189 By default, the diff control lines (those with *** or ---) are 1190 created with a trailing newline. This is helpful so that inputs 1191 created from file.readlines() result in diffs that are suitable for 1192 file.writelines() since both the inputs and outputs have trailing 1193 newlines. 1194 1195 For inputs that do not have trailing newlines, set the lineterm 1196 argument to "" so that the output will be uniformly newline free. 1197 1198 The context diff format normally has a header for filenames and 1199 modification times. Any or all of these may be specified using 1200 strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1201 The modification times are normally expressed in the ISO 8601 format. 1202 If not specified, the strings default to blanks. 1203 1204 Example: 1205 1206 >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True), 1207 ... 'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')), 1208 ... end="") 1209 *** Original 1210 --- Current 1211 *************** 1212 *** 1,4 **** 1213 one 1214 ! two 1215 ! three 1216 four 1217 --- 1,4 ---- 1218 + zero 1219 one 1220 ! tree 1221 four 1222 """ 1223 1224 _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) 1225 prefix = dict(insert='+ ', delete='- ', replace='! 
', equal=' ') 1226 started = False 1227 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1228 if not started: 1229 started = True 1230 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1231 todate = '\t{}'.format(tofiledate) if tofiledate else '' 1232 yield '*** {}{}{}'.format(fromfile, fromdate, lineterm) 1233 yield '--- {}{}{}'.format(tofile, todate, lineterm) 1234 1235 first, last = group[0], group[-1] 1236 yield '***************' + lineterm 1237 1238 file1_range = _format_range_context(first[1], last[2]) 1239 yield '*** {} ****{}'.format(file1_range, lineterm) 1240 1241 if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group): 1242 for tag, i1, i2, _, _ in group: 1243 if tag != 'insert': 1244 for line in a[i1:i2]: 1245 yield prefix[tag] + line 1246 1247 file2_range = _format_range_context(first[3], last[4]) 1248 yield '--- {} ----{}'.format(file2_range, lineterm) 1249 1250 if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group): 1251 for tag, _, _, j1, j2 in group: 1252 if tag != 'delete': 1253 for line in b[j1:j2]: 1254 yield prefix[tag] + line 1255 1256def _check_types(a, b, *args): 1257 # Checking types is weird, but the alternative is garbled output when 1258 # someone passes mixed bytes and str to {unified,context}_diff(). E.g. 1259 # without this check, passing filenames as bytes results in output like 1260 # --- b'oldfile.txt' 1261 # +++ b'newfile.txt' 1262 # because of how str.format() incorporates bytes objects. 
1263 if a and not isinstance(a[0], str): 1264 raise TypeError('lines to compare must be str, not %s (%r)' % 1265 (type(a[0]).__name__, a[0])) 1266 if b and not isinstance(b[0], str): 1267 raise TypeError('lines to compare must be str, not %s (%r)' % 1268 (type(b[0]).__name__, b[0])) 1269 for arg in args: 1270 if not isinstance(arg, str): 1271 raise TypeError('all arguments must be str, not: %r' % (arg,)) 1272 1273def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'', 1274 fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'): 1275 r""" 1276 Compare `a` and `b`, two sequences of lines represented as bytes rather 1277 than str. This is a wrapper for `dfunc`, which is typically either 1278 unified_diff() or context_diff(). Inputs are losslessly converted to 1279 strings so that `dfunc` only has to worry about strings, and encoded 1280 back to bytes on return. This is necessary to compare files with 1281 unknown or inconsistent encoding. All other inputs (except `n`) must be 1282 bytes rather than str. 1283 """ 1284 def decode(s): 1285 try: 1286 return s.decode('ascii', 'surrogateescape') 1287 except AttributeError as err: 1288 msg = ('all arguments must be bytes, not %s (%r)' % 1289 (type(s).__name__, s)) 1290 raise TypeError(msg) from err 1291 a = list(map(decode, a)) 1292 b = list(map(decode, b)) 1293 fromfile = decode(fromfile) 1294 tofile = decode(tofile) 1295 fromfiledate = decode(fromfiledate) 1296 tofiledate = decode(tofiledate) 1297 lineterm = decode(lineterm) 1298 1299 lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm) 1300 for line in lines: 1301 yield line.encode('ascii', 'surrogateescape') 1302 1303def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): 1304 r""" 1305 Compare `a` and `b` (lists of strings); return a `Differ`-style delta. 
def _mdiff(fromlines, tolines, context=None, linejunk=None,
           charjunk=IS_CHARACTER_JUNK):
    r"""Returns generator yielding marked up from/to side by side differences.

    Arguments:
    fromlines -- list of text lines to compared to tolines
    tolines -- list of text lines to be compared to fromlines
    context -- number of context lines to display on each side of difference,
               if None, all from/to text lines will be generated.
    linejunk -- passed on to ndiff (see ndiff documentation)
    charjunk -- passed on to ndiff (see ndiff documentation)

    This function returns an iterator which returns a tuple:
    (from line tuple, to line tuple, boolean flag)

    from/to line tuple -- (line num, line text)
        line num -- integer or None (to indicate a context separation)
        line text -- original line text with following markers inserted:
            '\0+' -- marks start of added text
            '\0-' -- marks start of deleted text
            '\0^' -- marks start of changed text
            '\1' -- marks end of added/deleted/changed text

    boolean flag -- None indicates context separation, True indicates
        either "from" or "to" line contains a change, otherwise False.

    This function/iterator was originally developed to generate side by side
    file difference for making HTML pages (see HtmlDiff class for example
    usage).

    Note, this function utilizes the ndiff function to generate the side by
    side difference markup.  Optional ndiff arguments may be passed to this
    function and they in turn will be passed to ndiff.
    """
    # NOTE(review): `re` is also imported at module level; the local import
    # is retained here to keep behavior byte-identical.
    import re

    # regular expression for finding intraline change indices
    change_re = re.compile(r'(\++|\-+|\^+)')

    # create the difference iterator to generate the differences
    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)

    def _make_line(lines, format_key, side, num_lines=[0,0]):
        """Returns line of text with user's change markup and line formatting.

        lines -- list of lines from the ndiff generator to produce a line of
                 text from.  When producing the line of text to return, the
                 lines used are removed from this list.
        format_key -- '+' return first line in list with "add" markup around
                          the entire line.
                      '-' return first line in list with "delete" markup around
                          the entire line.
                      '?' return first line in list with add/delete/change
                          intraline markup (indices obtained from second line)
                      None return first line in list with no markup
        side -- indice into the num_lines list (0=from,1=to)
        num_lines -- from/to current line number.  This is NOT intended to be a
                     passed parameter.  It is present as a keyword argument to
                     maintain memory of the current line numbers between calls
                     of this function.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        # The mutable default [0,0] is deliberate: it persists per-side line
        # counters across calls for the lifetime of this _mdiff invocation.
        num_lines[side] += 1
        # Handle case where no user markup is to be added, just return line of
        # text with user's line format to allow for usage of the line number.
        if format_key is None:
            return (num_lines[side],lines.pop(0)[2:])
        # Handle case of intraline changes
        if format_key == '?':
            text, markers = lines.pop(0), lines.pop(0)
            # find intraline changes (store change type and indices in tuples)
            sub_info = []
            def record_sub_info(match_object,sub_info=sub_info):
                sub_info.append([match_object.group(1)[0],match_object.span()])
                return match_object.group(1)
            change_re.sub(record_sub_info,markers)
            # process each tuple inserting our special marks that won't be
            # noticed by an xml/html escaper.
            # (reversed so earlier insertions don't shift later indices)
            for key,(begin,end) in reversed(sub_info):
                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
            text = text[2:]
        # Handle case of add/delete entire line
        else:
            text = lines.pop(0)[2:]
            # if line of text is just a newline, insert a space so there is
            # something for the user to highlight and see.
            if not text:
                text = ' '
            # insert marks that won't be noticed by an xml/html escaper.
            text = '\0' + format_key + text + '\1'
        # Return line of text, first allow user's line formatter to do its
        # thing (such as adding the line number) then replace the special
        # marks with what the user's change markup.
        return (num_lines[side],text)

    def _line_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from a
        differencing iterator, processes them and yields them.  When it can
        it yields both a "from" and a "to" line, otherwise it will yield one
        or the other.  In addition to yielding the lines of from/to text, a
        boolean flag is yielded to indicate if the text line(s) have
        differences in them.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        lines = []
        num_blanks_pending, num_blanks_to_yield = 0, 0
        while True:
            # Load up next 4 lines so we can look ahead, create strings which
            # are a concatenation of the first character of each of the 4 lines
            # so we can do some very readable comparisons.
            while len(lines) < 4:
                lines.append(next(diff_lines_iterator, 'X'))
            s = ''.join([line[0] for line in lines])
            if s.startswith('X'):
                # When no more lines, pump out any remaining blank lines so the
                # corresponding add/delete lines get a matching blank line so
                # all line pairs get yielded at the next level.
                num_blanks_to_yield = num_blanks_pending
            elif s.startswith('-?+?'):
                # simple intraline change
                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('--++'):
                # in delete block, add block coming: we do NOT want to get
                # caught up on blank lines yet, just process the delete line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith(('--?+', '--+', '- ')):
                # in delete block and see an intraline change or unchanged line
                # coming: yield the delete line and then blanks
                from_line,to_line = _make_line(lines,'-',0), None
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
            elif s.startswith('-+?'):
                # intraline change
                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
                continue
            elif s.startswith('-?+'):
                # intraline change
                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
                continue
            elif s.startswith('-'):
                # delete FROM line
                num_blanks_pending -= 1
                yield _make_line(lines,'-',0), None, True
                continue
            elif s.startswith('+--'):
                # in add block, delete block coming: we do NOT want to get
                # caught up on blank lines yet, just process the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(('+ ', '+-')):
                # will be leaving an add block: yield blanks then add line
                from_line, to_line = None, _make_line(lines,'+',1)
                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
            elif s.startswith('+'):
                # inside an add block, yield the add line
                num_blanks_pending += 1
                yield None, _make_line(lines,'+',1), True
                continue
            elif s.startswith(' '):
                # unchanged text, yield it to both sides
                # (lines[:] copy keeps the second call's pop(0) consistent)
                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
                continue
            # Catch up on the blank lines so when we yield the next from/to
            # pair, they are lined up.
            while(num_blanks_to_yield < 0):
                num_blanks_to_yield += 1
                yield None,('','\n'),True
            while(num_blanks_to_yield > 0):
                num_blanks_to_yield -= 1
                yield ('','\n'),None,True
            if s.startswith('X'):
                return
            else:
                yield from_line,to_line,True

    def _line_pair_iterator():
        """Yields from/to lines of text with a change indication.

        This function is an iterator.  It itself pulls lines from the line
        iterator.  Its difference from that iterator is that this function
        always yields a pair of from/to text lines (with the change
        indication).  If necessary it will collect single from/to lines
        until it has a matching pair from/to pair to yield.

        Note, this function is purposefully not defined at the module scope so
        that data it needs from its parent function (within whose context it
        is defined) does not need to be of module scope.
        """
        line_iterator = _line_iterator()
        fromlines,tolines=[],[]
        while True:
            # Collecting lines of text until we have a from/to pair
            while (len(fromlines)==0 or len(tolines)==0):
                try:
                    from_line, to_line, found_diff = next(line_iterator)
                except StopIteration:
                    return
                if from_line is not None:
                    fromlines.append((from_line,found_diff))
                if to_line is not None:
                    tolines.append((to_line,found_diff))
            # Once we have a pair, remove them from the collection and yield it
            from_line, fromDiff = fromlines.pop(0)
            to_line, to_diff = tolines.pop(0)
            yield (from_line,to_line,fromDiff or to_diff)

    # Handle case where user does not want context differencing, just yield
    # them up without doing anything else with them.
    line_pair_iterator = _line_pair_iterator()
    if context is None:
        yield from line_pair_iterator
    # Handle case where user wants context differencing.  We must do some
    # storage of lines until we know for sure that they are to be yielded.
    else:
        context += 1
        lines_to_write = 0
        while True:
            # Store lines up until we find a difference, note use of a
            # circular queue because we only need to keep around what
            # we need for context.
            index, contextLines = 0, [None]*(context)
            found_diff = False
            while(found_diff is False):
                try:
                    from_line, to_line, found_diff = next(line_pair_iterator)
                except StopIteration:
                    return
                i = index % context
                contextLines[i] = (from_line, to_line, found_diff)
                index += 1
            # Yield lines that we have collected so far, but first yield
            # the user's separator.
            if index > context:
                yield None, None, None
                lines_to_write = context
            else:
                lines_to_write = index
                index = 0
            while(lines_to_write):
                i = index % context
                index += 1
                yield contextLines[i]
                lines_to_write -= 1
            # Now yield the context lines after the change
            lines_to_write = context-1
            try:
                while(lines_to_write):
                    from_line, to_line, found_diff = next(line_pair_iterator)
                    # If another change within the context, extend the context
                    if found_diff:
                        lines_to_write = context-1
                    else:
                        lines_to_write -= 1
                    yield from_line, to_line, found_diff
            except StopIteration:
                # Catch exception from next() and return normally
                return


# Skeleton for a complete HTML document; filled in by HtmlDiff.make_file().
_file_template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html>

<head>
    <meta http-equiv="Content-Type"
          content="text/html; charset=%(charset)s" />
    <title></title>
    <style type="text/css">%(styles)s
    </style>
</head>

<body>
    %(table)s%(legend)s
</body>

</html>"""
# CSS classes used by the generated table markup below.
_styles = """
        table.diff {font-family:Courier; border:medium;}
        .diff_header {background-color:#e0e0e0}
        td.diff_header {text-align:right}
        .diff_next {background-color:#c0c0c0}
        .diff_add {background-color:#aaffaa}
        .diff_chg {background-color:#ffff77}
        .diff_sub {background-color:#ffaaaa}"""

# Skeleton for the side-by-side comparison table.
_table_template = """
    <table class="diff" id="difflib_chg_%(prefix)s_top"
           cellspacing="0" cellpadding="0" rules="groups" >
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
        %(header_row)s
        <tbody>
%(data_rows)s        </tbody>
    </table>"""

# Static legend table explaining the colors and navigation links.
_legend = """
    <table class="diff" summary="Legends">
        <tr> <th colspan="2"> Legends </th> </tr>
        <tr> <td> <table border="" summary="Colors">
                      <tr><th> Colors </th> </tr>
                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
                      <tr><td class="diff_chg">Changed</td> </tr>
                      <tr><td class="diff_sub">Deleted</td> </tr>
                  </table></td>
             <td> <table border="" summary="Links">
                      <tr><th colspan="2"> Links </th> </tr>
                      <tr><td>(f)irst change</td> </tr>
                      <tr><td>(n)ext change</td> </tr>
                      <tr><td>(t)op</td> </tr>
                  </table></td> </tr>
    </table>"""

class HtmlDiff(object):
    """For producing HTML side by side comparison with change highlights.

    This class can be used to create an HTML table (or a complete HTML file
    containing the table) showing a side by side, line by line comparison
    of text with inter-line and intra-line change highlights.  The table can
    be generated in either full or contextual difference mode.

    The following methods are provided for HTML generation:

    make_table -- generates HTML for a single side by side table
    make_file -- generates complete HTML file with a single side by side table

    See tools/scripts/diff.py for an example usage of this class.
    """

    # Templates are exposed as class attributes so subclasses can override
    # the generated markup without touching the module-level defaults.
    _file_template = _file_template
    _styles = _styles
    _table_template = _table_template
    _legend = _legend
    _default_prefix = 0   # counter used to build unique anchor id prefixes

    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
                 charjunk=IS_CHARACTER_JUNK):
        """HtmlDiff instance initializer

        Arguments:
        tabsize -- tab stop spacing, defaults to 8.
        wrapcolumn -- column number where lines are broken and wrapped,
            defaults to None where lines are not wrapped.
        linejunk,charjunk -- keyword arguments passed into ndiff() (used by
            HtmlDiff() to generate the side by side HTML differences).  See
            ndiff() documentation for argument default values and descriptions.
        """
        self._tabsize = tabsize
        self._wrapcolumn = wrapcolumn
        self._linejunk = linejunk
        self._charjunk = charjunk

    def make_file(self, fromlines, tolines, fromdesc='', todesc='',
                  context=False, numlines=5, *, charset='utf-8'):
        """Returns HTML file of side by side comparison with change highlights

        Arguments:
        fromlines -- list of "from" lines
        tolines -- list of "to" lines
        fromdesc -- "from" file column header string
        todesc -- "to" file column header string
        context -- set to True for contextual differences (defaults to False
            which shows full differences).
        numlines -- number of context lines.  When context is set True,
            controls number of lines displayed before and after the change.
            When context is False, controls the number of lines to place
            the "next" link anchors before the next change (so click of
            "next" link jumps to just before the change).
        charset -- charset of the HTML document
        """
        # encode/decode round-trip forces non-encodable characters into
        # numeric character references so the declared charset is honest.
        return (self._file_template % dict(
            styles=self._styles,
            legend=self._legend,
            table=self.make_table(fromlines, tolines, fromdesc, todesc,
                                  context=context, numlines=numlines),
            charset=charset
        )).encode(charset, 'xmlcharrefreplace').decode(charset)
    def _tab_newline_replace(self,fromlines,tolines):
        """Returns from/to line lists with tabs expanded and newlines removed.

        Instead of tab characters being replaced by the number of spaces
        needed to fill in to the next tab stop, this function will fill
        the space with tab characters.  This is done so that the difference
        algorithms can identify changes in a file when tabs are replaced by
        spaces and vice versa.  At the end of the HTML generation, the tab
        characters will be replaced with a nonbreakable space.
        """
        def expand_tabs(line):
            # hide real spaces
            line = line.replace(' ','\0')
            # expand tabs into spaces
            line = line.expandtabs(self._tabsize)
            # replace spaces from expanded tabs back into tab characters
            # (we'll replace them with markup after we do differencing)
            line = line.replace(' ','\t')
            return line.replace('\0',' ').rstrip('\n')
        fromlines = [expand_tabs(line) for line in fromlines]
        tolines = [expand_tabs(line) for line in tolines]
        return fromlines,tolines

    def _split_line(self,data_list,line_num,text):
        """Builds list of text lines by splitting text lines at wrap point

        This function will determine if the input text line needs to be
        wrapped (split) into separate lines.  If so, the first wrap point
        will be determined and the first line appended to the output
        text line list.  This function is used recursively to handle
        the second part of the split line to further split it.
        """
        # if blank line or context separator, just add it to the output list
        if not line_num:
            data_list.append((line_num,text))
            return

        # if line text doesn't need wrapping, just add it to the output list
        size = len(text)
        max = self._wrapcolumn
        # second test: each \0<key>...\1 marker pair adds 3 non-display chars
        if (size <= max) or ((size -(text.count('\0')*3)) <= max):
            data_list.append((line_num,text))
            return

        # scan text looking for the wrap point, keeping track if the wrap
        # point is inside markers
        i = 0
        n = 0
        mark = ''
        while n < max and i < size:
            if text[i] == '\0':
                i += 1
                mark = text[i]   # remember which change marker we are inside
                i += 1
            elif text[i] == '\1':
                i += 1
                mark = ''
            else:
                i += 1
                n += 1           # only visible characters count toward max

        # wrap point is inside text, break it up into separate lines
        line1 = text[:i]
        line2 = text[i:]

        # if wrap point is inside markers, place end marker at end of first
        # line and start marker at beginning of second line because each
        # line will have its own table tag markup around it.
        if mark:
            line1 = line1 + '\1'
            line2 = '\0' + mark + line2

        # tack on first line onto the output list
        data_list.append((line_num,line1))

        # use this routine again to wrap the remaining text
        # ('>' line number marks a continuation row in the output table)
        self._split_line(data_list,'>',line2)

    def _line_wrapper(self,diffs):
        """Returns iterator that splits (wraps) mdiff text lines"""

        # pull from/to data and flags from mdiff iterator
        for fromdata,todata,flag in diffs:
            # check for context separators and pass them through
            if flag is None:
                yield fromdata,todata,flag
                continue
            (fromline,fromtext),(toline,totext) = fromdata,todata
            # for each from/to line split it at the wrap column to form
            # list of text lines.
            fromlist,tolist = [],[]
            self._split_line(fromlist,fromline,fromtext)
            self._split_line(tolist,toline,totext)
            # yield from/to line in pairs inserting blank lines as
            # necessary when one side has more wrapped lines
            while fromlist or tolist:
                if fromlist:
                    fromdata = fromlist.pop(0)
                else:
                    fromdata = ('',' ')
                if tolist:
                    todata = tolist.pop(0)
                else:
                    todata = ('',' ')
                yield fromdata,todata,flag
1822 fromlist,tolist = [],[] 1823 self._split_line(fromlist,fromline,fromtext) 1824 self._split_line(tolist,toline,totext) 1825 # yield from/to line in pairs inserting blank lines as 1826 # necessary when one side has more wrapped lines 1827 while fromlist or tolist: 1828 if fromlist: 1829 fromdata = fromlist.pop(0) 1830 else: 1831 fromdata = ('',' ') 1832 if tolist: 1833 todata = tolist.pop(0) 1834 else: 1835 todata = ('',' ') 1836 yield fromdata,todata,flag 1837 1838 def _collect_lines(self,diffs): 1839 """Collects mdiff output into separate lists 1840 1841 Before storing the mdiff from/to data into a list, it is converted 1842 into a single line of text with HTML markup. 1843 """ 1844 1845 fromlist,tolist,flaglist = [],[],[] 1846 # pull from/to data and flags from mdiff style iterator 1847 for fromdata,todata,flag in diffs: 1848 try: 1849 # store HTML markup of the lines into the lists 1850 fromlist.append(self._format_line(0,flag,*fromdata)) 1851 tolist.append(self._format_line(1,flag,*todata)) 1852 except TypeError: 1853 # exceptions occur for lines where context separators go 1854 fromlist.append(None) 1855 tolist.append(None) 1856 flaglist.append(flag) 1857 return fromlist,tolist,flaglist 1858 1859 def _format_line(self,side,flag,linenum,text): 1860 """Returns HTML markup of "from" / "to" text lines 1861 1862 side -- 0 or 1 indicating "from" or "to" text 1863 flag -- indicates if difference on line 1864 linenum -- line number (used for line number column) 1865 text -- line text to be marked up 1866 """ 1867 try: 1868 linenum = '%d' % linenum 1869 id = ' id="%s%s"' % (self._prefix[side],linenum) 1870 except TypeError: 1871 # handle blank lines where linenum is '>' or '' 1872 id = '' 1873 # replace those things that would get confused with HTML symbols 1874 text=text.replace("&","&").replace(">",">").replace("<","<") 1875 1876 # make space non-breakable so they don't get compressed or line wrapped 1877 text = text.replace(' ',' ').rstrip() 1878 1879 return 
'<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \ 1880 % (id,linenum,text) 1881 1882 def _make_prefix(self): 1883 """Create unique anchor prefixes""" 1884 1885 # Generate a unique anchor prefix so multiple tables 1886 # can exist on the same HTML page without conflicts. 1887 fromprefix = "from%d_" % HtmlDiff._default_prefix 1888 toprefix = "to%d_" % HtmlDiff._default_prefix 1889 HtmlDiff._default_prefix += 1 1890 # store prefixes so line format method has access 1891 self._prefix = [fromprefix,toprefix] 1892 1893 def _convert_flags(self,fromlist,tolist,flaglist,context,numlines): 1894 """Makes list of "next" links""" 1895 1896 # all anchor names will be generated using the unique "to" prefix 1897 toprefix = self._prefix[1] 1898 1899 # process change flags, generating middle column of next anchors/links 1900 next_id = ['']*len(flaglist) 1901 next_href = ['']*len(flaglist) 1902 num_chg, in_change = 0, False 1903 last = 0 1904 for i,flag in enumerate(flaglist): 1905 if flag: 1906 if not in_change: 1907 in_change = True 1908 last = i 1909 # at the beginning of a change, drop an anchor a few lines 1910 # (the context lines) before the change for the previous 1911 # link 1912 i = max([0,i-numlines]) 1913 next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg) 1914 # at the beginning of a change, drop a link to the next 1915 # change 1916 num_chg += 1 1917 next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % ( 1918 toprefix,num_chg) 1919 else: 1920 in_change = False 1921 # check for cases where there is no content to avoid exceptions 1922 if not flaglist: 1923 flaglist = [False] 1924 next_id = [''] 1925 next_href = [''] 1926 last = 0 1927 if context: 1928 fromlist = ['<td></td><td> No Differences Found </td>'] 1929 tolist = fromlist 1930 else: 1931 fromlist = tolist = ['<td></td><td> Empty File </td>'] 1932 # if not a change on first line, drop a link 1933 if not flaglist[0]: 1934 next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix 1935 # 
redo the last link to link to the top 1936 next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix) 1937 1938 return fromlist,tolist,flaglist,next_href,next_id 1939 1940 def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False, 1941 numlines=5): 1942 """Returns HTML table of side by side comparison with change highlights 1943 1944 Arguments: 1945 fromlines -- list of "from" lines 1946 tolines -- list of "to" lines 1947 fromdesc -- "from" file column header string 1948 todesc -- "to" file column header string 1949 context -- set to True for contextual differences (defaults to False 1950 which shows full differences). 1951 numlines -- number of context lines. When context is set True, 1952 controls number of lines displayed before and after the change. 1953 When context is False, controls the number of lines to place 1954 the "next" link anchors before the next change (so click of 1955 "next" link jumps to just before the change). 1956 """ 1957 1958 # make unique anchor prefixes so that multiple tables may exist 1959 # on the same page without conflict. 
1960 self._make_prefix() 1961 1962 # change tabs to spaces before it gets more difficult after we insert 1963 # markup 1964 fromlines,tolines = self._tab_newline_replace(fromlines,tolines) 1965 1966 # create diffs iterator which generates side by side from/to data 1967 if context: 1968 context_lines = numlines 1969 else: 1970 context_lines = None 1971 diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk, 1972 charjunk=self._charjunk) 1973 1974 # set up iterator to wrap lines that exceed desired width 1975 if self._wrapcolumn: 1976 diffs = self._line_wrapper(diffs) 1977 1978 # collect up from/to lines and flags into lists (also format the lines) 1979 fromlist,tolist,flaglist = self._collect_lines(diffs) 1980 1981 # process change flags, generating middle column of next anchors/links 1982 fromlist,tolist,flaglist,next_href,next_id = self._convert_flags( 1983 fromlist,tolist,flaglist,context,numlines) 1984 1985 s = [] 1986 fmt = ' <tr><td class="diff_next"%s>%s</td>%s' + \ 1987 '<td class="diff_next">%s</td>%s</tr>\n' 1988 for i in range(len(flaglist)): 1989 if flaglist[i] is None: 1990 # mdiff yields None on separator lines skip the bogus ones 1991 # generated for the first line 1992 if i > 0: 1993 s.append(' </tbody> \n <tbody>\n') 1994 else: 1995 s.append( fmt % (next_id[i],next_href[i],fromlist[i], 1996 next_href[i],tolist[i])) 1997 if fromdesc or todesc: 1998 header_row = '<thead><tr>%s%s%s%s</tr></thead>' % ( 1999 '<th class="diff_next"><br /></th>', 2000 '<th colspan="2" class="diff_header">%s</th>' % fromdesc, 2001 '<th class="diff_next"><br /></th>', 2002 '<th colspan="2" class="diff_header">%s</th>' % todesc) 2003 else: 2004 header_row = '' 2005 2006 table = self._table_template % dict( 2007 data_rows=''.join(s), 2008 header_row=header_row, 2009 prefix=self._prefix[1]) 2010 2011 return table.replace('\0+','<span class="diff_add">'). \ 2012 replace('\0-','<span class="diff_sub">'). \ 2013 replace('\0^','<span class="diff_chg">'). 
\ 2014 replace('\1','</span>'). \ 2015 replace('\t',' ') 2016 2017del re 2018 2019def restore(delta, which): 2020 r""" 2021 Generate one of the two sequences that generated a delta. 2022 2023 Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract 2024 lines originating from file 1 or 2 (parameter `which`), stripping off line 2025 prefixes. 2026 2027 Examples: 2028 2029 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True), 2030 ... 'ore\ntree\nemu\n'.splitlines(keepends=True)) 2031 >>> diff = list(diff) 2032 >>> print(''.join(restore(diff, 1)), end="") 2033 one 2034 two 2035 three 2036 >>> print(''.join(restore(diff, 2)), end="") 2037 ore 2038 tree 2039 emu 2040 """ 2041 try: 2042 tag = {1: "- ", 2: "+ "}[int(which)] 2043 except KeyError: 2044 raise ValueError('unknown delta choice (must be 1 or 2): %r' 2045 % which) from None 2046 prefixes = (" ", tag) 2047 for line in delta: 2048 if line[:2] in prefixes: 2049 yield line[2:] 2050 2051def _test(): 2052 import doctest, difflib 2053 return doctest.testmod(difflib) 2054 2055if __name__ == "__main__": 2056 _test() 2057