• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2Module difflib -- helpers for computing deltas between objects.
3
4Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
5    Use SequenceMatcher to return list of the best "good enough" matches.
6
7Function context_diff(a, b):
8    For two lists of strings, return a delta in context diff format.
9
10Function ndiff(a, b):
11    Return a delta: the difference between `a` and `b` (lists of strings).
12
13Function restore(delta, which):
14    Return one of the two sequences that generated an ndiff delta.
15
16Function unified_diff(a, b):
17    For two lists of strings, return a delta in unified diff format.
18
19Class SequenceMatcher:
20    A flexible class for comparing pairs of sequences of any type.
21
22Class Differ:
23    For producing human-readable deltas from sequences of lines of text.
24
25Class HtmlDiff:
26    For producing HTML side by side comparison with change highlights.
27"""
28
29__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
30           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
31           'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
32
33from heapq import nlargest as _nlargest
34from collections import namedtuple as _namedtuple
35from types import GenericAlias
36
37Match = _namedtuple('Match', 'a b size')
38
39def _calculate_ratio(matches, length):
40    if length:
41        return 2.0 * matches / length
42    return 1.0
43
44class SequenceMatcher:
45
46    """
47    SequenceMatcher is a flexible class for comparing pairs of sequences of
48    any type, so long as the sequence elements are hashable.  The basic
49    algorithm predates, and is a little fancier than, an algorithm
50    published in the late 1980's by Ratcliff and Obershelp under the
51    hyperbolic name "gestalt pattern matching".  The basic idea is to find
52    the longest contiguous matching subsequence that contains no "junk"
53    elements (R-O doesn't address junk).  The same idea is then applied
54    recursively to the pieces of the sequences to the left and to the right
55    of the matching subsequence.  This does not yield minimal edit
56    sequences, but does tend to yield matches that "look right" to people.
57
58    SequenceMatcher tries to compute a "human-friendly diff" between two
59    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
60    longest *contiguous* & junk-free matching subsequence.  That's what
61    catches peoples' eyes.  The Windows(tm) windiff has another interesting
62    notion, pairing up elements that appear uniquely in each sequence.
63    That, and the method here, appear to yield more intuitive difference
64    reports than does diff.  This method appears to be the least vulnerable
65    to synching up on blocks of "junk lines", though (like blank lines in
66    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
67    because this is the only method of the 3 that has a *concept* of
68    "junk" <wink>.
69
70    Example, comparing two strings, and considering blanks to be "junk":
71
72    >>> s = SequenceMatcher(lambda x: x == " ",
73    ...                     "private Thread currentThread;",
74    ...                     "private volatile Thread currentThread;")
75    >>>
76
77    .ratio() returns a float in [0, 1], measuring the "similarity" of the
78    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
79    sequences are close matches:
80
81    >>> print(round(s.ratio(), 3))
82    0.866
83    >>>
84
85    If you're only interested in where the sequences match,
86    .get_matching_blocks() is handy:
87
88    >>> for block in s.get_matching_blocks():
89    ...     print("a[%d] and b[%d] match for %d elements" % block)
90    a[0] and b[0] match for 8 elements
91    a[8] and b[17] match for 21 elements
92    a[29] and b[38] match for 0 elements
93
94    Note that the last tuple returned by .get_matching_blocks() is always a
95    dummy, (len(a), len(b), 0), and this is the only case in which the last
96    tuple element (number of elements matched) is 0.
97
98    If you want to know how to change the first sequence into the second,
99    use .get_opcodes():
100
101    >>> for opcode in s.get_opcodes():
102    ...     print("%6s a[%d:%d] b[%d:%d]" % opcode)
103     equal a[0:8] b[0:8]
104    insert a[8:8] b[8:17]
105     equal a[8:29] b[17:38]
106
107    See the Differ class for a fancy human-friendly file differencer, which
108    uses SequenceMatcher both to compare sequences of lines, and to compare
109    sequences of characters within similar (near-matching) lines.
110
111    See also function get_close_matches() in this module, which shows how
112    simple code building on SequenceMatcher can be used to do useful work.
113
114    Timing:  Basic R-O is cubic time worst case and quadratic time expected
115    case.  SequenceMatcher is quadratic time for the worst case and has
116    expected-case behavior dependent in a complicated way on how many
117    elements the sequences have in common; best case time is linear.
118
119    Methods:
120
121    __init__(isjunk=None, a='', b='')
122        Construct a SequenceMatcher.
123
124    set_seqs(a, b)
125        Set the two sequences to be compared.
126
127    set_seq1(a)
128        Set the first sequence to be compared.
129
130    set_seq2(b)
131        Set the second sequence to be compared.
132
133    find_longest_match(alo=0, ahi=None, blo=0, bhi=None)
134        Find longest matching block in a[alo:ahi] and b[blo:bhi].
135
136    get_matching_blocks()
137        Return list of triples describing matching subsequences.
138
139    get_opcodes()
140        Return list of 5-tuples describing how to turn a into b.
141
142    ratio()
143        Return a measure of the sequences' similarity (float in [0,1]).
144
145    quick_ratio()
146        Return an upper bound on .ratio() relatively quickly.
147
148    real_quick_ratio()
149        Return an upper bound on ratio() very quickly.
150    """
151
152    def __init__(self, isjunk=None, a='', b='', autojunk=True):
153        """Construct a SequenceMatcher.
154
155        Optional arg isjunk is None (the default), or a one-argument
156        function that takes a sequence element and returns true iff the
157        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
158        no elements are considered to be junk.  For example, pass
159            lambda x: x in " \\t"
160        if you're comparing lines as sequences of characters, and don't
161        want to synch up on blanks or hard tabs.
162
163        Optional arg a is the first of two sequences to be compared.  By
164        default, an empty string.  The elements of a must be hashable.  See
165        also .set_seqs() and .set_seq1().
166
167        Optional arg b is the second of two sequences to be compared.  By
168        default, an empty string.  The elements of b must be hashable. See
169        also .set_seqs() and .set_seq2().
170
171        Optional arg autojunk should be set to False to disable the
172        "automatic junk heuristic" that treats popular elements as junk
173        (see module documentation for more information).
174        """
175
176        # Members:
177        # a
178        #      first sequence
179        # b
180        #      second sequence; differences are computed as "what do
181        #      we need to do to 'a' to change it into 'b'?"
182        # b2j
183        #      for x in b, b2j[x] is a list of the indices (into b)
184        #      at which x appears; junk and popular elements do not appear
185        # fullbcount
186        #      for x in b, fullbcount[x] == the number of times x
187        #      appears in b; only materialized if really needed (used
188        #      only for computing quick_ratio())
189        # matching_blocks
190        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
191        #      ascending & non-overlapping in i and in j; terminated by
192        #      a dummy (len(a), len(b), 0) sentinel
193        # opcodes
194        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
195        #      one of
196        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
197        #          'delete'    a[i1:i2] should be deleted
198        #          'insert'    b[j1:j2] should be inserted
199        #          'equal'     a[i1:i2] == b[j1:j2]
200        # isjunk
201        #      a user-supplied function taking a sequence element and
202        #      returning true iff the element is "junk" -- this has
203        #      subtle but helpful effects on the algorithm, which I'll
204        #      get around to writing up someday <0.9 wink>.
205        #      DON'T USE!  Only __chain_b uses this.  Use "in self.bjunk".
206        # bjunk
207        #      the items in b for which isjunk is True.
208        # bpopular
209        #      nonjunk items in b treated as junk by the heuristic (if used).
210
211        self.isjunk = isjunk
212        self.a = self.b = None
213        self.autojunk = autojunk
214        self.set_seqs(a, b)
215
216    def set_seqs(self, a, b):
217        """Set the two sequences to be compared.
218
219        >>> s = SequenceMatcher()
220        >>> s.set_seqs("abcd", "bcde")
221        >>> s.ratio()
222        0.75
223        """
224
225        self.set_seq1(a)
226        self.set_seq2(b)
227
228    def set_seq1(self, a):
229        """Set the first sequence to be compared.
230
231        The second sequence to be compared is not changed.
232
233        >>> s = SequenceMatcher(None, "abcd", "bcde")
234        >>> s.ratio()
235        0.75
236        >>> s.set_seq1("bcde")
237        >>> s.ratio()
238        1.0
239        >>>
240
241        SequenceMatcher computes and caches detailed information about the
242        second sequence, so if you want to compare one sequence S against
243        many sequences, use .set_seq2(S) once and call .set_seq1(x)
244        repeatedly for each of the other sequences.
245
246        See also set_seqs() and set_seq2().
247        """
248
249        if a is self.a:
250            return
251        self.a = a
252        self.matching_blocks = self.opcodes = None
253
254    def set_seq2(self, b):
255        """Set the second sequence to be compared.
256
257        The first sequence to be compared is not changed.
258
259        >>> s = SequenceMatcher(None, "abcd", "bcde")
260        >>> s.ratio()
261        0.75
262        >>> s.set_seq2("abcd")
263        >>> s.ratio()
264        1.0
265        >>>
266
267        SequenceMatcher computes and caches detailed information about the
268        second sequence, so if you want to compare one sequence S against
269        many sequences, use .set_seq2(S) once and call .set_seq1(x)
270        repeatedly for each of the other sequences.
271
272        See also set_seqs() and set_seq1().
273        """
274
275        if b is self.b:
276            return
277        self.b = b
278        self.matching_blocks = self.opcodes = None
279        self.fullbcount = None
280        self.__chain_b()
281
282    # For each element x in b, set b2j[x] to a list of the indices in
283    # b where x appears; the indices are in increasing order; note that
284    # the number of times x appears in b is len(b2j[x]) ...
285    # when self.isjunk is defined, junk elements don't show up in this
286    # map at all, which stops the central find_longest_match method
287    # from starting any matching block at a junk element ...
288    # b2j also does not contain entries for "popular" elements, meaning
289    # elements that account for more than 1 + 1% of the total elements, and
290    # when the sequence is reasonably large (>= 200 elements); this can
291    # be viewed as an adaptive notion of semi-junk, and yields an enormous
292    # speedup when, e.g., comparing program files with hundreds of
293    # instances of "return NULL;" ...
294    # note that this is only called when b changes; so for cross-product
295    # kinds of matches, it's best to call set_seq2 once, then set_seq1
296    # repeatedly
297
298    def __chain_b(self):
299        # Because isjunk is a user-defined (not C) function, and we test
300        # for junk a LOT, it's important to minimize the number of calls.
301        # Before the tricks described here, __chain_b was by far the most
302        # time-consuming routine in the whole module!  If anyone sees
303        # Jim Roskind, thank him again for profile.py -- I never would
304        # have guessed that.
305        # The first trick is to build b2j ignoring the possibility
306        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
307        # out the junk later is much cheaper than building b2j "right"
308        # from the start.
309        b = self.b
310        self.b2j = b2j = {}
311
312        for i, elt in enumerate(b):
313            indices = b2j.setdefault(elt, [])
314            indices.append(i)
315
316        # Purge junk elements
317        self.bjunk = junk = set()
318        isjunk = self.isjunk
319        if isjunk:
320            for elt in b2j.keys():
321                if isjunk(elt):
322                    junk.add(elt)
323            for elt in junk: # separate loop avoids separate list of keys
324                del b2j[elt]
325
326        # Purge popular elements that are not junk
327        self.bpopular = popular = set()
328        n = len(b)
329        if self.autojunk and n >= 200:
330            ntest = n // 100 + 1
331            for elt, idxs in b2j.items():
332                if len(idxs) > ntest:
333                    popular.add(elt)
334            for elt in popular: # ditto; as fast for 1% deletion
335                del b2j[elt]
336
337    def find_longest_match(self, alo=0, ahi=None, blo=0, bhi=None):
338        """Find longest matching block in a[alo:ahi] and b[blo:bhi].
339
340        By default it will find the longest match in the entirety of a and b.
341
342        If isjunk is not defined:
343
344        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
345            alo <= i <= i+k <= ahi
346            blo <= j <= j+k <= bhi
347        and for all (i',j',k') meeting those conditions,
348            k >= k'
349            i <= i'
350            and if i == i', j <= j'
351
352        In other words, of all maximal matching blocks, return one that
353        starts earliest in a, and of all those maximal matching blocks that
354        start earliest in a, return the one that starts earliest in b.
355
356        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
357        >>> s.find_longest_match(0, 5, 0, 9)
358        Match(a=0, b=4, size=5)
359
360        If isjunk is defined, first the longest matching block is
361        determined as above, but with the additional restriction that no
362        junk element appears in the block.  Then that block is extended as
363        far as possible by matching (only) junk elements on both sides.  So
364        the resulting block never matches on junk except as identical junk
365        happens to be adjacent to an "interesting" match.
366
367        Here's the same example as before, but considering blanks to be
368        junk.  That prevents " abcd" from matching the " abcd" at the tail
369        end of the second sequence directly.  Instead only the "abcd" can
370        match, and matches the leftmost "abcd" in the second sequence:
371
372        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
373        >>> s.find_longest_match(0, 5, 0, 9)
374        Match(a=1, b=0, size=4)
375
376        If no blocks match, return (alo, blo, 0).
377
378        >>> s = SequenceMatcher(None, "ab", "c")
379        >>> s.find_longest_match(0, 2, 0, 1)
380        Match(a=0, b=0, size=0)
381        """
382
383        # CAUTION:  stripping common prefix or suffix would be incorrect.
384        # E.g.,
385        #    ab
386        #    acab
387        # Longest matching block is "ab", but if common prefix is
388        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
389        # strip, so ends up claiming that ab is changed to acab by
390        # inserting "ca" in the middle.  That's minimal but unintuitive:
391        # "it's obvious" that someone inserted "ac" at the front.
392        # Windiff ends up at the same place as diff, but by pairing up
393        # the unique 'b's and then matching the first two 'a's.
394
395        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.bjunk.__contains__
396        if ahi is None:
397            ahi = len(a)
398        if bhi is None:
399            bhi = len(b)
400        besti, bestj, bestsize = alo, blo, 0
401        # find longest junk-free match
402        # during an iteration of the loop, j2len[j] = length of longest
403        # junk-free match ending with a[i-1] and b[j]
404        j2len = {}
405        nothing = []
406        for i in range(alo, ahi):
407            # look at all instances of a[i] in b; note that because
408            # b2j has no junk keys, the loop is skipped if a[i] is junk
409            j2lenget = j2len.get
410            newj2len = {}
411            for j in b2j.get(a[i], nothing):
412                # a[i] matches b[j]
413                if j < blo:
414                    continue
415                if j >= bhi:
416                    break
417                k = newj2len[j] = j2lenget(j-1, 0) + 1
418                if k > bestsize:
419                    besti, bestj, bestsize = i-k+1, j-k+1, k
420            j2len = newj2len
421
422        # Extend the best by non-junk elements on each end.  In particular,
423        # "popular" non-junk elements aren't in b2j, which greatly speeds
424        # the inner loop above, but also means "the best" match so far
425        # doesn't contain any junk *or* popular non-junk elements.
426        while besti > alo and bestj > blo and \
427              not isbjunk(b[bestj-1]) and \
428              a[besti-1] == b[bestj-1]:
429            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
430        while besti+bestsize < ahi and bestj+bestsize < bhi and \
431              not isbjunk(b[bestj+bestsize]) and \
432              a[besti+bestsize] == b[bestj+bestsize]:
433            bestsize += 1
434
435        # Now that we have a wholly interesting match (albeit possibly
436        # empty!), we may as well suck up the matching junk on each
437        # side of it too.  Can't think of a good reason not to, and it
438        # saves post-processing the (possibly considerable) expense of
439        # figuring out what to do with it.  In the case of an empty
440        # interesting match, this is clearly the right thing to do,
441        # because no other kind of match is possible in the regions.
442        while besti > alo and bestj > blo and \
443              isbjunk(b[bestj-1]) and \
444              a[besti-1] == b[bestj-1]:
445            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
446        while besti+bestsize < ahi and bestj+bestsize < bhi and \
447              isbjunk(b[bestj+bestsize]) and \
448              a[besti+bestsize] == b[bestj+bestsize]:
449            bestsize = bestsize + 1
450
451        return Match(besti, bestj, bestsize)
452
453    def get_matching_blocks(self):
454        """Return list of triples describing matching subsequences.
455
456        Each triple is of the form (i, j, n), and means that
457        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
458        i and in j.  New in Python 2.5, it's also guaranteed that if
459        (i, j, n) and (i', j', n') are adjacent triples in the list, and
460        the second is not the last triple in the list, then i+n != i' or
461        j+n != j'.  IOW, adjacent triples never describe adjacent equal
462        blocks.
463
464        The last triple is a dummy, (len(a), len(b), 0), and is the only
465        triple with n==0.
466
467        >>> s = SequenceMatcher(None, "abxcd", "abcd")
468        >>> list(s.get_matching_blocks())
469        [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
470        """
471
472        if self.matching_blocks is not None:
473            return self.matching_blocks
474        la, lb = len(self.a), len(self.b)
475
476        # This is most naturally expressed as a recursive algorithm, but
477        # at least one user bumped into extreme use cases that exceeded
478        # the recursion limit on their box.  So, now we maintain a list
479        # ('queue`) of blocks we still need to look at, and append partial
480        # results to `matching_blocks` in a loop; the matches are sorted
481        # at the end.
482        queue = [(0, la, 0, lb)]
483        matching_blocks = []
484        while queue:
485            alo, ahi, blo, bhi = queue.pop()
486            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
487            # a[alo:i] vs b[blo:j] unknown
488            # a[i:i+k] same as b[j:j+k]
489            # a[i+k:ahi] vs b[j+k:bhi] unknown
490            if k:   # if k is 0, there was no matching block
491                matching_blocks.append(x)
492                if alo < i and blo < j:
493                    queue.append((alo, i, blo, j))
494                if i+k < ahi and j+k < bhi:
495                    queue.append((i+k, ahi, j+k, bhi))
496        matching_blocks.sort()
497
498        # It's possible that we have adjacent equal blocks in the
499        # matching_blocks list now.  Starting with 2.5, this code was added
500        # to collapse them.
501        i1 = j1 = k1 = 0
502        non_adjacent = []
503        for i2, j2, k2 in matching_blocks:
504            # Is this block adjacent to i1, j1, k1?
505            if i1 + k1 == i2 and j1 + k1 == j2:
506                # Yes, so collapse them -- this just increases the length of
507                # the first block by the length of the second, and the first
508                # block so lengthened remains the block to compare against.
509                k1 += k2
510            else:
511                # Not adjacent.  Remember the first block (k1==0 means it's
512                # the dummy we started with), and make the second block the
513                # new block to compare against.
514                if k1:
515                    non_adjacent.append((i1, j1, k1))
516                i1, j1, k1 = i2, j2, k2
517        if k1:
518            non_adjacent.append((i1, j1, k1))
519
520        non_adjacent.append( (la, lb, 0) )
521        self.matching_blocks = list(map(Match._make, non_adjacent))
522        return self.matching_blocks
523
524    def get_opcodes(self):
525        """Return list of 5-tuples describing how to turn a into b.
526
527        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
528        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
529        tuple preceding it, and likewise for j1 == the previous j2.
530
531        The tags are strings, with these meanings:
532
533        'replace':  a[i1:i2] should be replaced by b[j1:j2]
534        'delete':   a[i1:i2] should be deleted.
535                    Note that j1==j2 in this case.
536        'insert':   b[j1:j2] should be inserted at a[i1:i1].
537                    Note that i1==i2 in this case.
538        'equal':    a[i1:i2] == b[j1:j2]
539
540        >>> a = "qabxcd"
541        >>> b = "abycdf"
542        >>> s = SequenceMatcher(None, a, b)
543        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
544        ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
545        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
546         delete a[0:1] (q) b[0:0] ()
547          equal a[1:3] (ab) b[0:2] (ab)
548        replace a[3:4] (x) b[2:3] (y)
549          equal a[4:6] (cd) b[3:5] (cd)
550         insert a[6:6] () b[5:6] (f)
551        """
552
553        if self.opcodes is not None:
554            return self.opcodes
555        i = j = 0
556        self.opcodes = answer = []
557        for ai, bj, size in self.get_matching_blocks():
558            # invariant:  we've pumped out correct diffs to change
559            # a[:i] into b[:j], and the next matching block is
560            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
561            # out a diff to change a[i:ai] into b[j:bj], pump out
562            # the matching block, and move (i,j) beyond the match
563            tag = ''
564            if i < ai and j < bj:
565                tag = 'replace'
566            elif i < ai:
567                tag = 'delete'
568            elif j < bj:
569                tag = 'insert'
570            if tag:
571                answer.append( (tag, i, ai, j, bj) )
572            i, j = ai+size, bj+size
573            # the list of matching blocks is terminated by a
574            # sentinel with size 0
575            if size:
576                answer.append( ('equal', ai, i, bj, j) )
577        return answer
578
579    def get_grouped_opcodes(self, n=3):
580        """ Isolate change clusters by eliminating ranges with no changes.
581
582        Return a generator of groups with up to n lines of context.
583        Each group is in the same format as returned by get_opcodes().
584
585        >>> from pprint import pprint
586        >>> a = list(map(str, range(1,40)))
587        >>> b = a[:]
588        >>> b[8:8] = ['i']     # Make an insertion
589        >>> b[20] += 'x'       # Make a replacement
590        >>> b[23:28] = []      # Make a deletion
591        >>> b[30] += 'y'       # Make another replacement
592        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
593        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
594         [('equal', 16, 19, 17, 20),
595          ('replace', 19, 20, 20, 21),
596          ('equal', 20, 22, 21, 23),
597          ('delete', 22, 27, 23, 23),
598          ('equal', 27, 30, 23, 26)],
599         [('equal', 31, 34, 27, 30),
600          ('replace', 34, 35, 30, 31),
601          ('equal', 35, 38, 31, 34)]]
602        """
603
604        codes = self.get_opcodes()
605        if not codes:
606            codes = [("equal", 0, 1, 0, 1)]
607        # Fixup leading and trailing groups if they show no changes.
608        if codes[0][0] == 'equal':
609            tag, i1, i2, j1, j2 = codes[0]
610            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
611        if codes[-1][0] == 'equal':
612            tag, i1, i2, j1, j2 = codes[-1]
613            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
614
615        nn = n + n
616        group = []
617        for tag, i1, i2, j1, j2 in codes:
618            # End the current group and start a new one whenever
619            # there is a large range with no changes.
620            if tag == 'equal' and i2-i1 > nn:
621                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
622                yield group
623                group = []
624                i1, j1 = max(i1, i2-n), max(j1, j2-n)
625            group.append((tag, i1, i2, j1 ,j2))
626        if group and not (len(group)==1 and group[0][0] == 'equal'):
627            yield group
628
629    def ratio(self):
630        """Return a measure of the sequences' similarity (float in [0,1]).
631
632        Where T is the total number of elements in both sequences, and
633        M is the number of matches, this is 2.0*M / T.
634        Note that this is 1 if the sequences are identical, and 0 if
635        they have nothing in common.
636
637        .ratio() is expensive to compute if you haven't already computed
638        .get_matching_blocks() or .get_opcodes(), in which case you may
639        want to try .quick_ratio() or .real_quick_ratio() first to get an
640        upper bound.
641
642        >>> s = SequenceMatcher(None, "abcd", "bcde")
643        >>> s.ratio()
644        0.75
645        >>> s.quick_ratio()
646        0.75
647        >>> s.real_quick_ratio()
648        1.0
649        """
650
651        matches = sum(triple[-1] for triple in self.get_matching_blocks())
652        return _calculate_ratio(matches, len(self.a) + len(self.b))
653
654    def quick_ratio(self):
655        """Return an upper bound on ratio() relatively quickly.
656
657        This isn't defined beyond that it is an upper bound on .ratio(), and
658        is faster to compute.
659        """
660
661        # viewing a and b as multisets, set matches to the cardinality
662        # of their intersection; this counts the number of matches
663        # without regard to order, so is clearly an upper bound
664        if self.fullbcount is None:
665            self.fullbcount = fullbcount = {}
666            for elt in self.b:
667                fullbcount[elt] = fullbcount.get(elt, 0) + 1
668        fullbcount = self.fullbcount
669        # avail[x] is the number of times x appears in 'b' less the
670        # number of times we've seen it in 'a' so far ... kinda
671        avail = {}
672        availhas, matches = avail.__contains__, 0
673        for elt in self.a:
674            if availhas(elt):
675                numb = avail[elt]
676            else:
677                numb = fullbcount.get(elt, 0)
678            avail[elt] = numb - 1
679            if numb > 0:
680                matches = matches + 1
681        return _calculate_ratio(matches, len(self.a) + len(self.b))
682
683    def real_quick_ratio(self):
684        """Return an upper bound on ratio() very quickly.
685
686        This isn't defined beyond that it is an upper bound on .ratio(), and
687        is faster to compute than either .ratio() or .quick_ratio().
688        """
689
690        la, lb = len(self.a), len(self.b)
691        # can't have more matches than the number of elements in the
692        # shorter sequence
693        return _calculate_ratio(min(la, lb), la + lb)
694
695    __class_getitem__ = classmethod(GenericAlias)
696
697
698def get_close_matches(word, possibilities, n=3, cutoff=0.6):
699    """Use SequenceMatcher to return list of the best "good enough" matches.
700
701    word is a sequence for which close matches are desired (typically a
702    string).
703
704    possibilities is a list of sequences against which to match word
705    (typically a list of strings).
706
707    Optional arg n (default 3) is the maximum number of close matches to
708    return.  n must be > 0.
709
710    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
711    that don't score at least that similar to word are ignored.
712
713    The best (no more than n) matches among the possibilities are returned
714    in a list, sorted by similarity score, most similar first.
715
716    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
717    ['apple', 'ape']
718    >>> import keyword as _keyword
719    >>> get_close_matches("wheel", _keyword.kwlist)
720    ['while']
721    >>> get_close_matches("Apple", _keyword.kwlist)
722    []
723    >>> get_close_matches("accept", _keyword.kwlist)
724    ['except']
725    """
726
727    if not n >  0:
728        raise ValueError("n must be > 0: %r" % (n,))
729    if not 0.0 <= cutoff <= 1.0:
730        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
731    result = []
732    s = SequenceMatcher()
733    s.set_seq2(word)
734    for x in possibilities:
735        s.set_seq1(x)
736        if s.real_quick_ratio() >= cutoff and \
737           s.quick_ratio() >= cutoff and \
738           s.ratio() >= cutoff:
739            result.append((s.ratio(), x))
740
741    # Move the best scorers to head of list
742    result = _nlargest(n, result)
743    # Strip scores for the best n matches
744    return [x for score, x in result]
745
746
747def _keep_original_ws(s, tag_s):
748    """Replace whitespace with the original whitespace characters in `s`"""
749    return ''.join(
750        c if tag_c == " " and c.isspace() else tag_c
751        for c, tag_c in zip(s, tag_s)
752    )
753
754
755
756class Differ:
757    r"""
758    Differ is a class for comparing sequences of lines of text, and
759    producing human-readable differences or deltas.  Differ uses
760    SequenceMatcher both to compare sequences of lines, and to compare
761    sequences of characters within similar (near-matching) lines.
762
763    Each line of a Differ delta begins with a two-letter code:
764
765        '- '    line unique to sequence 1
766        '+ '    line unique to sequence 2
767        '  '    line common to both sequences
768        '? '    line not present in either input sequence
769
770    Lines beginning with '? ' attempt to guide the eye to intraline
771    differences, and were not present in either input sequence.  These lines
772    can be confusing if the sequences contain tab characters.
773
774    Note that Differ makes no claim to produce a *minimal* diff.  To the
775    contrary, minimal diffs are often counter-intuitive, because they synch
776    up anywhere possible, sometimes accidental matches 100 pages apart.
777    Restricting synch points to contiguous matches preserves some notion of
778    locality, at the occasional cost of producing a longer diff.
779
780    Example: Comparing two texts.
781
782    First we set up the texts, sequences of individual single-line strings
783    ending with newlines (such sequences can also be obtained from the
784    `readlines()` method of file-like objects):
785
786    >>> text1 = '''  1. Beautiful is better than ugly.
787    ...   2. Explicit is better than implicit.
788    ...   3. Simple is better than complex.
789    ...   4. Complex is better than complicated.
790    ... '''.splitlines(keepends=True)
791    >>> len(text1)
792    4
793    >>> text1[0][-1]
794    '\n'
795    >>> text2 = '''  1. Beautiful is better than ugly.
796    ...   3.   Simple is better than complex.
797    ...   4. Complicated is better than complex.
798    ...   5. Flat is better than nested.
799    ... '''.splitlines(keepends=True)
800
801    Next we instantiate a Differ object:
802
803    >>> d = Differ()
804
805    Note that when instantiating a Differ object we may pass functions to
806    filter out line and character 'junk'.  See Differ.__init__ for details.
807
808    Finally, we compare the two:
809
810    >>> result = list(d.compare(text1, text2))
811
812    'result' is a list of strings, so let's pretty-print it:
813
814    >>> from pprint import pprint as _pprint
815    >>> _pprint(result)
816    ['    1. Beautiful is better than ugly.\n',
817     '-   2. Explicit is better than implicit.\n',
818     '-   3. Simple is better than complex.\n',
819     '+   3.   Simple is better than complex.\n',
820     '?     ++\n',
821     '-   4. Complex is better than complicated.\n',
822     '?            ^                     ---- ^\n',
823     '+   4. Complicated is better than complex.\n',
824     '?           ++++ ^                      ^\n',
825     '+   5. Flat is better than nested.\n']
826
827    As a single multi-line string it looks like this:
828
829    >>> print(''.join(result), end="")
830        1. Beautiful is better than ugly.
831    -   2. Explicit is better than implicit.
832    -   3. Simple is better than complex.
833    +   3.   Simple is better than complex.
834    ?     ++
835    -   4. Complex is better than complicated.
836    ?            ^                     ---- ^
837    +   4. Complicated is better than complex.
838    ?           ++++ ^                      ^
839    +   5. Flat is better than nested.
840
841    Methods:
842
843    __init__(linejunk=None, charjunk=None)
844        Construct a text differencer, with optional filters.
845
846    compare(a, b)
847        Compare two sequences of lines; generate the resulting delta.
848    """
849
850    def __init__(self, linejunk=None, charjunk=None):
851        """
852        Construct a text differencer, with optional filters.
853
854        The two optional keyword parameters are for filter functions:
855
856        - `linejunk`: A function that should accept a single string argument,
857          and return true iff the string is junk. The module-level function
858          `IS_LINE_JUNK` may be used to filter out lines without visible
859          characters, except for at most one splat ('#').  It is recommended
860          to leave linejunk None; the underlying SequenceMatcher class has
861          an adaptive notion of "noise" lines that's better than any static
862          definition the author has ever been able to craft.
863
864        - `charjunk`: A function that should accept a string of length 1. The
865          module-level function `IS_CHARACTER_JUNK` may be used to filter out
866          whitespace characters (a blank or tab; **note**: bad idea to include
867          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
868        """
869
870        self.linejunk = linejunk
871        self.charjunk = charjunk
872
873    def compare(self, a, b):
874        r"""
875        Compare two sequences of lines; generate the resulting delta.
876
877        Each sequence must contain individual single-line strings ending with
878        newlines. Such sequences can be obtained from the `readlines()` method
879        of file-like objects.  The delta generated also consists of newline-
880        terminated strings, ready to be printed as-is via the writeline()
881        method of a file-like object.
882
883        Example:
884
885        >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
886        ...                                'ore\ntree\nemu\n'.splitlines(True))),
887        ...       end="")
888        - one
889        ?  ^
890        + ore
891        ?  ^
892        - two
893        - three
894        ?  -
895        + tree
896        + emu
897        """
898
899        cruncher = SequenceMatcher(self.linejunk, a, b)
900        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
901            if tag == 'replace':
902                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
903            elif tag == 'delete':
904                g = self._dump('-', a, alo, ahi)
905            elif tag == 'insert':
906                g = self._dump('+', b, blo, bhi)
907            elif tag == 'equal':
908                g = self._dump(' ', a, alo, ahi)
909            else:
910                raise ValueError('unknown tag %r' % (tag,))
911
912            yield from g
913
914    def _dump(self, tag, x, lo, hi):
915        """Generate comparison results for a same-tagged range."""
916        for i in range(lo, hi):
917            yield '%s %s' % (tag, x[i])
918
919    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
920        assert alo < ahi and blo < bhi
921        # dump the shorter block first -- reduces the burden on short-term
922        # memory if the blocks are of very different sizes
923        if bhi - blo < ahi - alo:
924            first  = self._dump('+', b, blo, bhi)
925            second = self._dump('-', a, alo, ahi)
926        else:
927            first  = self._dump('-', a, alo, ahi)
928            second = self._dump('+', b, blo, bhi)
929
930        for g in first, second:
931            yield from g
932
933    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
934        r"""
935        When replacing one block of lines with another, search the blocks
936        for *similar* lines; the best-matching pair (if any) is used as a
937        synch point, and intraline difference marking is done on the
938        similar pair. Lots of work, but often worth it.
939
940        Example:
941
942        >>> d = Differ()
943        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
944        ...                            ['abcdefGhijkl\n'], 0, 1)
945        >>> print(''.join(results), end="")
946        - abcDefghiJkl
947        ?    ^  ^  ^
948        + abcdefGhijkl
949        ?    ^  ^  ^
950        """
951
952        # don't synch up unless the lines have a similarity score of at
953        # least cutoff; best_ratio tracks the best score seen so far
954        best_ratio, cutoff = 0.74, 0.75
955        cruncher = SequenceMatcher(self.charjunk)
956        eqi, eqj = None, None   # 1st indices of equal lines (if any)
957
958        # search for the pair that matches best without being identical
959        # (identical lines must be junk lines, & we don't want to synch up
960        # on junk -- unless we have to)
961        for j in range(blo, bhi):
962            bj = b[j]
963            cruncher.set_seq2(bj)
964            for i in range(alo, ahi):
965                ai = a[i]
966                if ai == bj:
967                    if eqi is None:
968                        eqi, eqj = i, j
969                    continue
970                cruncher.set_seq1(ai)
971                # computing similarity is expensive, so use the quick
972                # upper bounds first -- have seen this speed up messy
973                # compares by a factor of 3.
974                # note that ratio() is only expensive to compute the first
975                # time it's called on a sequence pair; the expensive part
976                # of the computation is cached by cruncher
977                if cruncher.real_quick_ratio() > best_ratio and \
978                      cruncher.quick_ratio() > best_ratio and \
979                      cruncher.ratio() > best_ratio:
980                    best_ratio, best_i, best_j = cruncher.ratio(), i, j
981        if best_ratio < cutoff:
982            # no non-identical "pretty close" pair
983            if eqi is None:
984                # no identical pair either -- treat it as a straight replace
985                yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
986                return
987            # no close pair, but an identical pair -- synch up on that
988            best_i, best_j, best_ratio = eqi, eqj, 1.0
989        else:
990            # there's a close pair, so forget the identical pair (if any)
991            eqi = None
992
993        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
994        # identical
995
996        # pump out diffs from before the synch point
997        yield from self._fancy_helper(a, alo, best_i, b, blo, best_j)
998
999        # do intraline marking on the synch pair
1000        aelt, belt = a[best_i], b[best_j]
1001        if eqi is None:
1002            # pump out a '-', '?', '+', '?' quad for the synched lines
1003            atags = btags = ""
1004            cruncher.set_seqs(aelt, belt)
1005            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
1006                la, lb = ai2 - ai1, bj2 - bj1
1007                if tag == 'replace':
1008                    atags += '^' * la
1009                    btags += '^' * lb
1010                elif tag == 'delete':
1011                    atags += '-' * la
1012                elif tag == 'insert':
1013                    btags += '+' * lb
1014                elif tag == 'equal':
1015                    atags += ' ' * la
1016                    btags += ' ' * lb
1017                else:
1018                    raise ValueError('unknown tag %r' % (tag,))
1019            yield from self._qformat(aelt, belt, atags, btags)
1020        else:
1021            # the synch pair is identical
1022            yield '  ' + aelt
1023
1024        # pump out diffs from after the synch point
1025        yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
1026
1027    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
1028        g = []
1029        if alo < ahi:
1030            if blo < bhi:
1031                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
1032            else:
1033                g = self._dump('-', a, alo, ahi)
1034        elif blo < bhi:
1035            g = self._dump('+', b, blo, bhi)
1036
1037        yield from g
1038
1039    def _qformat(self, aline, bline, atags, btags):
1040        r"""
1041        Format "?" output and deal with tabs.
1042
1043        Example:
1044
1045        >>> d = Differ()
1046        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
1047        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
1048        >>> for line in results: print(repr(line))
1049        ...
1050        '- \tabcDefghiJkl\n'
1051        '? \t ^ ^  ^\n'
1052        '+ \tabcdefGhijkl\n'
1053        '? \t ^ ^  ^\n'
1054        """
1055        atags = _keep_original_ws(aline, atags).rstrip()
1056        btags = _keep_original_ws(bline, btags).rstrip()
1057
1058        yield "- " + aline
1059        if atags:
1060            yield f"? {atags}\n"
1061
1062        yield "+ " + bline
1063        if btags:
1064            yield f"? {btags}\n"
1065
1066# With respect to junk, an earlier version of ndiff simply refused to
1067# *start* a match with a junk element.  The result was cases like this:
1068#     before: private Thread currentThread;
1069#     after:  private volatile Thread currentThread;
1070# If you consider whitespace to be junk, the longest contiguous match
1071# not starting with junk is "e Thread currentThread".  So ndiff reported
1072# that "e volatil" was inserted between the 't' and the 'e' in "private".
1073# While an accurate view, to people that's absurd.  The current version
1074# looks for matching blocks that are entirely junk-free, then extends the
1075# longest one of those as far as possible but only with matching junk.
1076# So now "currentThread" is matched, then extended to suck up the
1077# preceding blank; then "private" is matched, and extended to suck up the
1078# following blank; then "Thread" is matched; and finally ndiff reports
1079# that "volatile " was inserted before "Thread".  The only quibble
1080# remaining is that perhaps it was really the case that " volatile"
1081# was inserted after "private".  I can live with that <wink>.
1082
1083import re
1084
1085def IS_LINE_JUNK(line, pat=re.compile(r"\s*(?:#\s*)?$").match):
1086    r"""
1087    Return True for ignorable line: iff `line` is blank or contains a single '#'.
1088
1089    Examples:
1090
1091    >>> IS_LINE_JUNK('\n')
1092    True
1093    >>> IS_LINE_JUNK('  #   \n')
1094    True
1095    >>> IS_LINE_JUNK('hello\n')
1096    False
1097    """
1098
1099    return pat(line) is not None
1100
1101def IS_CHARACTER_JUNK(ch, ws=" \t"):
1102    r"""
1103    Return True for ignorable character: iff `ch` is a space or tab.
1104
1105    Examples:
1106
1107    >>> IS_CHARACTER_JUNK(' ')
1108    True
1109    >>> IS_CHARACTER_JUNK('\t')
1110    True
1111    >>> IS_CHARACTER_JUNK('\n')
1112    False
1113    >>> IS_CHARACTER_JUNK('x')
1114    False
1115    """
1116
1117    return ch in ws
1118
1119
1120########################################################################
1121###  Unified Diff
1122########################################################################
1123
1124def _format_range_unified(start, stop):
1125    'Convert range to the "ed" format'
1126    # Per the diff spec at http://www.unix.org/single_unix_specification/
1127    beginning = start + 1     # lines start numbering with one
1128    length = stop - start
1129    if length == 1:
1130        return '{}'.format(beginning)
1131    if not length:
1132        beginning -= 1        # empty ranges begin at line just before the range
1133    return '{},{}'.format(beginning, length)
1134
1135def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
1136                 tofiledate='', n=3, lineterm='\n'):
1137    r"""
1138    Compare two sequences of lines; generate the delta as a unified diff.
1139
1140    Unified diffs are a compact way of showing line changes and a few
1141    lines of context.  The number of context lines is set by 'n' which
1142    defaults to three.
1143
1144    By default, the diff control lines (those with ---, +++, or @@) are
1145    created with a trailing newline.  This is helpful so that inputs
1146    created from file.readlines() result in diffs that are suitable for
1147    file.writelines() since both the inputs and outputs have trailing
1148    newlines.
1149
1150    For inputs that do not have trailing newlines, set the lineterm
1151    argument to "" so that the output will be uniformly newline free.
1152
1153    The unidiff format normally has a header for filenames and modification
1154    times.  Any or all of these may be specified using strings for
1155    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
1156    The modification times are normally expressed in the ISO 8601 format.
1157
1158    Example:
1159
1160    >>> for line in unified_diff('one two three four'.split(),
1161    ...             'zero one tree four'.split(), 'Original', 'Current',
1162    ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
1163    ...             lineterm=''):
1164    ...     print(line)                 # doctest: +NORMALIZE_WHITESPACE
1165    --- Original        2005-01-26 23:30:50
1166    +++ Current         2010-04-02 10:20:52
1167    @@ -1,4 +1,4 @@
1168    +zero
1169     one
1170    -two
1171    -three
1172    +tree
1173     four
1174    """
1175
1176    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
1177    started = False
1178    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
1179        if not started:
1180            started = True
1181            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
1182            todate = '\t{}'.format(tofiledate) if tofiledate else ''
1183            yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
1184            yield '+++ {}{}{}'.format(tofile, todate, lineterm)
1185
1186        first, last = group[0], group[-1]
1187        file1_range = _format_range_unified(first[1], last[2])
1188        file2_range = _format_range_unified(first[3], last[4])
1189        yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)
1190
1191        for tag, i1, i2, j1, j2 in group:
1192            if tag == 'equal':
1193                for line in a[i1:i2]:
1194                    yield ' ' + line
1195                continue
1196            if tag in {'replace', 'delete'}:
1197                for line in a[i1:i2]:
1198                    yield '-' + line
1199            if tag in {'replace', 'insert'}:
1200                for line in b[j1:j2]:
1201                    yield '+' + line
1202
1203
1204########################################################################
1205###  Context Diff
1206########################################################################
1207
1208def _format_range_context(start, stop):
1209    'Convert range to the "ed" format'
1210    # Per the diff spec at http://www.unix.org/single_unix_specification/
1211    beginning = start + 1     # lines start numbering with one
1212    length = stop - start
1213    if not length:
1214        beginning -= 1        # empty ranges begin at line just before the range
1215    if length <= 1:
1216        return '{}'.format(beginning)
1217    return '{},{}'.format(beginning, beginning + length - 1)
1218
1219# See http://www.unix.org/single_unix_specification/
1220def context_diff(a, b, fromfile='', tofile='',
1221                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
1222    r"""
1223    Compare two sequences of lines; generate the delta as a context diff.
1224
1225    Context diffs are a compact way of showing line changes and a few
1226    lines of context.  The number of context lines is set by 'n' which
1227    defaults to three.
1228
1229    By default, the diff control lines (those with *** or ---) are
1230    created with a trailing newline.  This is helpful so that inputs
1231    created from file.readlines() result in diffs that are suitable for
1232    file.writelines() since both the inputs and outputs have trailing
1233    newlines.
1234
1235    For inputs that do not have trailing newlines, set the lineterm
1236    argument to "" so that the output will be uniformly newline free.
1237
1238    The context diff format normally has a header for filenames and
1239    modification times.  Any or all of these may be specified using
1240    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
1241    The modification times are normally expressed in the ISO 8601 format.
1242    If not specified, the strings default to blanks.
1243
1244    Example:
1245
1246    >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),
1247    ...       'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),
1248    ...       end="")
1249    *** Original
1250    --- Current
1251    ***************
1252    *** 1,4 ****
1253      one
1254    ! two
1255    ! three
1256      four
1257    --- 1,4 ----
1258    + zero
1259      one
1260    ! tree
1261      four
1262    """
1263
1264    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
1265    prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
1266    started = False
1267    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
1268        if not started:
1269            started = True
1270            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
1271            todate = '\t{}'.format(tofiledate) if tofiledate else ''
1272            yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
1273            yield '--- {}{}{}'.format(tofile, todate, lineterm)
1274
1275        first, last = group[0], group[-1]
1276        yield '***************' + lineterm
1277
1278        file1_range = _format_range_context(first[1], last[2])
1279        yield '*** {} ****{}'.format(file1_range, lineterm)
1280
1281        if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group):
1282            for tag, i1, i2, _, _ in group:
1283                if tag != 'insert':
1284                    for line in a[i1:i2]:
1285                        yield prefix[tag] + line
1286
1287        file2_range = _format_range_context(first[3], last[4])
1288        yield '--- {} ----{}'.format(file2_range, lineterm)
1289
1290        if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group):
1291            for tag, _, _, j1, j2 in group:
1292                if tag != 'delete':
1293                    for line in b[j1:j2]:
1294                        yield prefix[tag] + line
1295
1296def _check_types(a, b, *args):
1297    # Checking types is weird, but the alternative is garbled output when
1298    # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
1299    # without this check, passing filenames as bytes results in output like
1300    #   --- b'oldfile.txt'
1301    #   +++ b'newfile.txt'
1302    # because of how str.format() incorporates bytes objects.
1303    if a and not isinstance(a[0], str):
1304        raise TypeError('lines to compare must be str, not %s (%r)' %
1305                        (type(a[0]).__name__, a[0]))
1306    if b and not isinstance(b[0], str):
1307        raise TypeError('lines to compare must be str, not %s (%r)' %
1308                        (type(b[0]).__name__, b[0]))
1309    for arg in args:
1310        if not isinstance(arg, str):
1311            raise TypeError('all arguments must be str, not: %r' % (arg,))
1312
1313def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
1314               fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
1315    r"""
1316    Compare `a` and `b`, two sequences of lines represented as bytes rather
1317    than str. This is a wrapper for `dfunc`, which is typically either
1318    unified_diff() or context_diff(). Inputs are losslessly converted to
1319    strings so that `dfunc` only has to worry about strings, and encoded
1320    back to bytes on return. This is necessary to compare files with
1321    unknown or inconsistent encoding. All other inputs (except `n`) must be
1322    bytes rather than str.
1323    """
1324    def decode(s):
1325        try:
1326            return s.decode('ascii', 'surrogateescape')
1327        except AttributeError as err:
1328            msg = ('all arguments must be bytes, not %s (%r)' %
1329                   (type(s).__name__, s))
1330            raise TypeError(msg) from err
1331    a = list(map(decode, a))
1332    b = list(map(decode, b))
1333    fromfile = decode(fromfile)
1334    tofile = decode(tofile)
1335    fromfiledate = decode(fromfiledate)
1336    tofiledate = decode(tofiledate)
1337    lineterm = decode(lineterm)
1338
1339    lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
1340    for line in lines:
1341        yield line.encode('ascii', 'surrogateescape')
1342
1343def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
1344    r"""
1345    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
1346
1347    Optional keyword parameters `linejunk` and `charjunk` are for filter
1348    functions, or can be None:
1349
1350    - linejunk: A function that should accept a single string argument and
1351      return true iff the string is junk.  The default is None, and is
1352      recommended; the underlying SequenceMatcher class has an adaptive
1353      notion of "noise" lines.
1354
1355    - charjunk: A function that accepts a character (string of length
1356      1), and returns true iff the character is junk. The default is
1357      the module-level function IS_CHARACTER_JUNK, which filters out
1358      whitespace characters (a blank or tab; note: it's a bad idea to
1359      include newline in this!).
1360
1361    Tools/scripts/ndiff.py is a command-line front-end to this function.
1362
1363    Example:
1364
1365    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
1366    ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
1367    >>> print(''.join(diff), end="")
1368    - one
1369    ?  ^
1370    + ore
1371    ?  ^
1372    - two
1373    - three
1374    ?  -
1375    + tree
1376    + emu
1377    """
1378    return Differ(linejunk, charjunk).compare(a, b)
1379
1380def _mdiff(fromlines, tolines, context=None, linejunk=None,
1381           charjunk=IS_CHARACTER_JUNK):
1382    r"""Returns generator yielding marked up from/to side by side differences.
1383
1384    Arguments:
1385    fromlines -- list of text lines to compared to tolines
1386    tolines -- list of text lines to be compared to fromlines
1387    context -- number of context lines to display on each side of difference,
1388               if None, all from/to text lines will be generated.
1389    linejunk -- passed on to ndiff (see ndiff documentation)
1390    charjunk -- passed on to ndiff (see ndiff documentation)
1391
1392    This function returns an iterator which returns a tuple:
1393    (from line tuple, to line tuple, boolean flag)
1394
1395    from/to line tuple -- (line num, line text)
1396        line num -- integer or None (to indicate a context separation)
1397        line text -- original line text with following markers inserted:
1398            '\0+' -- marks start of added text
1399            '\0-' -- marks start of deleted text
1400            '\0^' -- marks start of changed text
1401            '\1' -- marks end of added/deleted/changed text
1402
1403    boolean flag -- None indicates context separation, True indicates
1404        either "from" or "to" line contains a change, otherwise False.
1405
1406    This function/iterator was originally developed to generate side by side
1407    file difference for making HTML pages (see HtmlDiff class for example
1408    usage).
1409
1410    Note, this function utilizes the ndiff function to generate the side by
1411    side difference markup.  Optional ndiff arguments may be passed to this
1412    function and they in turn will be passed to ndiff.
1413    """
1414    import re
1415
1416    # regular expression for finding intraline change indices
1417    change_re = re.compile(r'(\++|\-+|\^+)')
1418
1419    # create the difference iterator to generate the differences
1420    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
1421
1422    def _make_line(lines, format_key, side, num_lines=[0,0]):
1423        """Returns line of text with user's change markup and line formatting.
1424
1425        lines -- list of lines from the ndiff generator to produce a line of
1426                 text from.  When producing the line of text to return, the
1427                 lines used are removed from this list.
1428        format_key -- '+' return first line in list with "add" markup around
1429                          the entire line.
1430                      '-' return first line in list with "delete" markup around
1431                          the entire line.
1432                      '?' return first line in list with add/delete/change
1433                          intraline markup (indices obtained from second line)
1434                      None return first line in list with no markup
1435        side -- indice into the num_lines list (0=from,1=to)
1436        num_lines -- from/to current line number.  This is NOT intended to be a
1437                     passed parameter.  It is present as a keyword argument to
1438                     maintain memory of the current line numbers between calls
1439                     of this function.
1440
1441        Note, this function is purposefully not defined at the module scope so
1442        that data it needs from its parent function (within whose context it
1443        is defined) does not need to be of module scope.
1444        """
1445        num_lines[side] += 1
1446        # Handle case where no user markup is to be added, just return line of
1447        # text with user's line format to allow for usage of the line number.
1448        if format_key is None:
1449            return (num_lines[side],lines.pop(0)[2:])
1450        # Handle case of intraline changes
1451        if format_key == '?':
1452            text, markers = lines.pop(0), lines.pop(0)
1453            # find intraline changes (store change type and indices in tuples)
1454            sub_info = []
1455            def record_sub_info(match_object,sub_info=sub_info):
1456                sub_info.append([match_object.group(1)[0],match_object.span()])
1457                return match_object.group(1)
1458            change_re.sub(record_sub_info,markers)
1459            # process each tuple inserting our special marks that won't be
1460            # noticed by an xml/html escaper.
1461            for key,(begin,end) in reversed(sub_info):
1462                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
1463            text = text[2:]
1464        # Handle case of add/delete entire line
1465        else:
1466            text = lines.pop(0)[2:]
1467            # if line of text is just a newline, insert a space so there is
1468            # something for the user to highlight and see.
1469            if not text:
1470                text = ' '
1471            # insert marks that won't be noticed by an xml/html escaper.
1472            text = '\0' + format_key + text + '\1'
1473        # Return line of text, first allow user's line formatter to do its
1474        # thing (such as adding the line number) then replace the special
1475        # marks with what the user's change markup.
1476        return (num_lines[side],text)
1477
1478    def _line_iterator():
1479        """Yields from/to lines of text with a change indication.
1480
1481        This function is an iterator.  It itself pulls lines from a
1482        differencing iterator, processes them and yields them.  When it can
1483        it yields both a "from" and a "to" line, otherwise it will yield one
1484        or the other.  In addition to yielding the lines of from/to text, a
1485        boolean flag is yielded to indicate if the text line(s) have
1486        differences in them.
1487
1488        Note, this function is purposefully not defined at the module scope so
1489        that data it needs from its parent function (within whose context it
1490        is defined) does not need to be of module scope.
1491        """
1492        lines = []
1493        num_blanks_pending, num_blanks_to_yield = 0, 0
1494        while True:
1495            # Load up next 4 lines so we can look ahead, create strings which
1496            # are a concatenation of the first character of each of the 4 lines
1497            # so we can do some very readable comparisons.
1498            while len(lines) < 4:
1499                lines.append(next(diff_lines_iterator, 'X'))
1500            s = ''.join([line[0] for line in lines])
1501            if s.startswith('X'):
1502                # When no more lines, pump out any remaining blank lines so the
1503                # corresponding add/delete lines get a matching blank line so
1504                # all line pairs get yielded at the next level.
1505                num_blanks_to_yield = num_blanks_pending
1506            elif s.startswith('-?+?'):
1507                # simple intraline change
1508                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
1509                continue
1510            elif s.startswith('--++'):
1511                # in delete block, add block coming: we do NOT want to get
1512                # caught up on blank lines yet, just process the delete line
1513                num_blanks_pending -= 1
1514                yield _make_line(lines,'-',0), None, True
1515                continue
1516            elif s.startswith(('--?+', '--+', '- ')):
1517                # in delete block and see an intraline change or unchanged line
1518                # coming: yield the delete line and then blanks
1519                from_line,to_line = _make_line(lines,'-',0), None
1520                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
1521            elif s.startswith('-+?'):
1522                # intraline change
1523                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
1524                continue
1525            elif s.startswith('-?+'):
1526                # intraline change
1527                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
1528                continue
1529            elif s.startswith('-'):
1530                # delete FROM line
1531                num_blanks_pending -= 1
1532                yield _make_line(lines,'-',0), None, True
1533                continue
1534            elif s.startswith('+--'):
1535                # in add block, delete block coming: we do NOT want to get
1536                # caught up on blank lines yet, just process the add line
1537                num_blanks_pending += 1
1538                yield None, _make_line(lines,'+',1), True
1539                continue
1540            elif s.startswith(('+ ', '+-')):
1541                # will be leaving an add block: yield blanks then add line
1542                from_line, to_line = None, _make_line(lines,'+',1)
1543                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
1544            elif s.startswith('+'):
1545                # inside an add block, yield the add line
1546                num_blanks_pending += 1
1547                yield None, _make_line(lines,'+',1), True
1548                continue
1549            elif s.startswith(' '):
1550                # unchanged text, yield it to both sides
1551                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
1552                continue
1553            # Catch up on the blank lines so when we yield the next from/to
1554            # pair, they are lined up.
1555            while(num_blanks_to_yield < 0):
1556                num_blanks_to_yield += 1
1557                yield None,('','\n'),True
1558            while(num_blanks_to_yield > 0):
1559                num_blanks_to_yield -= 1
1560                yield ('','\n'),None,True
1561            if s.startswith('X'):
1562                return
1563            else:
1564                yield from_line,to_line,True
1565
1566    def _line_pair_iterator():
1567        """Yields from/to lines of text with a change indication.
1568
1569        This function is an iterator.  It itself pulls lines from the line
1570        iterator.  Its difference from that iterator is that this function
1571        always yields a pair of from/to text lines (with the change
1572        indication).  If necessary it will collect single from/to lines
1573        until it has a matching pair from/to pair to yield.
1574
1575        Note, this function is purposefully not defined at the module scope so
1576        that data it needs from its parent function (within whose context it
1577        is defined) does not need to be of module scope.
1578        """
1579        line_iterator = _line_iterator()
1580        fromlines,tolines=[],[]
1581        while True:
1582            # Collecting lines of text until we have a from/to pair
1583            while (len(fromlines)==0 or len(tolines)==0):
1584                try:
1585                    from_line, to_line, found_diff = next(line_iterator)
1586                except StopIteration:
1587                    return
1588                if from_line is not None:
1589                    fromlines.append((from_line,found_diff))
1590                if to_line is not None:
1591                    tolines.append((to_line,found_diff))
1592            # Once we have a pair, remove them from the collection and yield it
1593            from_line, fromDiff = fromlines.pop(0)
1594            to_line, to_diff = tolines.pop(0)
1595            yield (from_line,to_line,fromDiff or to_diff)
1596
1597    # Handle case where user does not want context differencing, just yield
1598    # them up without doing anything else with them.
1599    line_pair_iterator = _line_pair_iterator()
1600    if context is None:
1601        yield from line_pair_iterator
1602    # Handle case where user wants context differencing.  We must do some
1603    # storage of lines until we know for sure that they are to be yielded.
1604    else:
1605        context += 1
1606        lines_to_write = 0
1607        while True:
1608            # Store lines up until we find a difference, note use of a
1609            # circular queue because we only need to keep around what
1610            # we need for context.
1611            index, contextLines = 0, [None]*(context)
1612            found_diff = False
1613            while(found_diff is False):
1614                try:
1615                    from_line, to_line, found_diff = next(line_pair_iterator)
1616                except StopIteration:
1617                    return
1618                i = index % context
1619                contextLines[i] = (from_line, to_line, found_diff)
1620                index += 1
1621            # Yield lines that we have collected so far, but first yield
1622            # the user's separator.
1623            if index > context:
1624                yield None, None, None
1625                lines_to_write = context
1626            else:
1627                lines_to_write = index
1628                index = 0
1629            while(lines_to_write):
1630                i = index % context
1631                index += 1
1632                yield contextLines[i]
1633                lines_to_write -= 1
1634            # Now yield the context lines after the change
1635            lines_to_write = context-1
1636            try:
1637                while(lines_to_write):
1638                    from_line, to_line, found_diff = next(line_pair_iterator)
1639                    # If another change within the context, extend the context
1640                    if found_diff:
1641                        lines_to_write = context-1
1642                    else:
1643                        lines_to_write -= 1
1644                    yield from_line, to_line, found_diff
1645            except StopIteration:
1646                # Catch exception from next() and return normally
1647                return
1648
1649
1650_file_template = """
1651<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
1652          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
1653
1654<html>
1655
1656<head>
1657    <meta http-equiv="Content-Type"
1658          content="text/html; charset=%(charset)s" />
1659    <title></title>
1660    <style type="text/css">%(styles)s
1661    </style>
1662</head>
1663
1664<body>
1665    %(table)s%(legend)s
1666</body>
1667
1668</html>"""
1669
1670_styles = """
1671        table.diff {font-family:Courier; border:medium;}
1672        .diff_header {background-color:#e0e0e0}
1673        td.diff_header {text-align:right}
1674        .diff_next {background-color:#c0c0c0}
1675        .diff_add {background-color:#aaffaa}
1676        .diff_chg {background-color:#ffff77}
1677        .diff_sub {background-color:#ffaaaa}"""
1678
1679_table_template = """
1680    <table class="diff" id="difflib_chg_%(prefix)s_top"
1681           cellspacing="0" cellpadding="0" rules="groups" >
1682        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
1683        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
1684        %(header_row)s
1685        <tbody>
1686%(data_rows)s        </tbody>
1687    </table>"""
1688
1689_legend = """
1690    <table class="diff" summary="Legends">
1691        <tr> <th colspan="2"> Legends </th> </tr>
1692        <tr> <td> <table border="" summary="Colors">
1693                      <tr><th> Colors </th> </tr>
1694                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
1695                      <tr><td class="diff_chg">Changed</td> </tr>
1696                      <tr><td class="diff_sub">Deleted</td> </tr>
1697                  </table></td>
1698             <td> <table border="" summary="Links">
1699                      <tr><th colspan="2"> Links </th> </tr>
1700                      <tr><td>(f)irst change</td> </tr>
1701                      <tr><td>(n)ext change</td> </tr>
1702                      <tr><td>(t)op</td> </tr>
1703                  </table></td> </tr>
1704    </table>"""
1705
1706class HtmlDiff(object):
1707    """For producing HTML side by side comparison with change highlights.
1708
1709    This class can be used to create an HTML table (or a complete HTML file
1710    containing the table) showing a side by side, line by line comparison
1711    of text with inter-line and intra-line change highlights.  The table can
1712    be generated in either full or contextual difference mode.
1713
1714    The following methods are provided for HTML generation:
1715
1716    make_table -- generates HTML for a single side by side table
1717    make_file -- generates complete HTML file with a single side by side table
1718
1719    See tools/scripts/diff.py for an example usage of this class.
1720    """
1721
1722    _file_template = _file_template
1723    _styles = _styles
1724    _table_template = _table_template
1725    _legend = _legend
1726    _default_prefix = 0
1727
1728    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
1729                 charjunk=IS_CHARACTER_JUNK):
1730        """HtmlDiff instance initializer
1731
1732        Arguments:
1733        tabsize -- tab stop spacing, defaults to 8.
1734        wrapcolumn -- column number where lines are broken and wrapped,
1735            defaults to None where lines are not wrapped.
1736        linejunk,charjunk -- keyword arguments passed into ndiff() (used by
1737            HtmlDiff() to generate the side by side HTML differences).  See
1738            ndiff() documentation for argument default values and descriptions.
1739        """
1740        self._tabsize = tabsize
1741        self._wrapcolumn = wrapcolumn
1742        self._linejunk = linejunk
1743        self._charjunk = charjunk
1744
1745    def make_file(self, fromlines, tolines, fromdesc='', todesc='',
1746                  context=False, numlines=5, *, charset='utf-8'):
1747        """Returns HTML file of side by side comparison with change highlights
1748
1749        Arguments:
1750        fromlines -- list of "from" lines
1751        tolines -- list of "to" lines
1752        fromdesc -- "from" file column header string
1753        todesc -- "to" file column header string
1754        context -- set to True for contextual differences (defaults to False
1755            which shows full differences).
1756        numlines -- number of context lines.  When context is set True,
1757            controls number of lines displayed before and after the change.
1758            When context is False, controls the number of lines to place
1759            the "next" link anchors before the next change (so click of
1760            "next" link jumps to just before the change).
1761        charset -- charset of the HTML document
1762        """
1763
1764        return (self._file_template % dict(
1765            styles=self._styles,
1766            legend=self._legend,
1767            table=self.make_table(fromlines, tolines, fromdesc, todesc,
1768                                  context=context, numlines=numlines),
1769            charset=charset
1770        )).encode(charset, 'xmlcharrefreplace').decode(charset)
1771
1772    def _tab_newline_replace(self,fromlines,tolines):
1773        """Returns from/to line lists with tabs expanded and newlines removed.
1774
1775        Instead of tab characters being replaced by the number of spaces
1776        needed to fill in to the next tab stop, this function will fill
1777        the space with tab characters.  This is done so that the difference
1778        algorithms can identify changes in a file when tabs are replaced by
1779        spaces and vice versa.  At the end of the HTML generation, the tab
1780        characters will be replaced with a nonbreakable space.
1781        """
1782        def expand_tabs(line):
1783            # hide real spaces
1784            line = line.replace(' ','\0')
1785            # expand tabs into spaces
1786            line = line.expandtabs(self._tabsize)
1787            # replace spaces from expanded tabs back into tab characters
1788            # (we'll replace them with markup after we do differencing)
1789            line = line.replace(' ','\t')
1790            return line.replace('\0',' ').rstrip('\n')
1791        fromlines = [expand_tabs(line) for line in fromlines]
1792        tolines = [expand_tabs(line) for line in tolines]
1793        return fromlines,tolines
1794
1795    def _split_line(self,data_list,line_num,text):
1796        """Builds list of text lines by splitting text lines at wrap point
1797
1798        This function will determine if the input text line needs to be
1799        wrapped (split) into separate lines.  If so, the first wrap point
1800        will be determined and the first line appended to the output
1801        text line list.  This function is used recursively to handle
1802        the second part of the split line to further split it.
1803        """
1804        # if blank line or context separator, just add it to the output list
1805        if not line_num:
1806            data_list.append((line_num,text))
1807            return
1808
1809        # if line text doesn't need wrapping, just add it to the output list
1810        size = len(text)
1811        max = self._wrapcolumn
1812        if (size <= max) or ((size -(text.count('\0')*3)) <= max):
1813            data_list.append((line_num,text))
1814            return
1815
1816        # scan text looking for the wrap point, keeping track if the wrap
1817        # point is inside markers
1818        i = 0
1819        n = 0
1820        mark = ''
1821        while n < max and i < size:
1822            if text[i] == '\0':
1823                i += 1
1824                mark = text[i]
1825                i += 1
1826            elif text[i] == '\1':
1827                i += 1
1828                mark = ''
1829            else:
1830                i += 1
1831                n += 1
1832
1833        # wrap point is inside text, break it up into separate lines
1834        line1 = text[:i]
1835        line2 = text[i:]
1836
1837        # if wrap point is inside markers, place end marker at end of first
1838        # line and start marker at beginning of second line because each
1839        # line will have its own table tag markup around it.
1840        if mark:
1841            line1 = line1 + '\1'
1842            line2 = '\0' + mark + line2
1843
1844        # tack on first line onto the output list
1845        data_list.append((line_num,line1))
1846
1847        # use this routine again to wrap the remaining text
1848        self._split_line(data_list,'>',line2)
1849
1850    def _line_wrapper(self,diffs):
1851        """Returns iterator that splits (wraps) mdiff text lines"""
1852
1853        # pull from/to data and flags from mdiff iterator
1854        for fromdata,todata,flag in diffs:
1855            # check for context separators and pass them through
1856            if flag is None:
1857                yield fromdata,todata,flag
1858                continue
1859            (fromline,fromtext),(toline,totext) = fromdata,todata
1860            # for each from/to line split it at the wrap column to form
1861            # list of text lines.
1862            fromlist,tolist = [],[]
1863            self._split_line(fromlist,fromline,fromtext)
1864            self._split_line(tolist,toline,totext)
1865            # yield from/to line in pairs inserting blank lines as
1866            # necessary when one side has more wrapped lines
1867            while fromlist or tolist:
1868                if fromlist:
1869                    fromdata = fromlist.pop(0)
1870                else:
1871                    fromdata = ('',' ')
1872                if tolist:
1873                    todata = tolist.pop(0)
1874                else:
1875                    todata = ('',' ')
1876                yield fromdata,todata,flag
1877
1878    def _collect_lines(self,diffs):
1879        """Collects mdiff output into separate lists
1880
1881        Before storing the mdiff from/to data into a list, it is converted
1882        into a single line of text with HTML markup.
1883        """
1884
1885        fromlist,tolist,flaglist = [],[],[]
1886        # pull from/to data and flags from mdiff style iterator
1887        for fromdata,todata,flag in diffs:
1888            try:
1889                # store HTML markup of the lines into the lists
1890                fromlist.append(self._format_line(0,flag,*fromdata))
1891                tolist.append(self._format_line(1,flag,*todata))
1892            except TypeError:
1893                # exceptions occur for lines where context separators go
1894                fromlist.append(None)
1895                tolist.append(None)
1896            flaglist.append(flag)
1897        return fromlist,tolist,flaglist
1898
1899    def _format_line(self,side,flag,linenum,text):
1900        """Returns HTML markup of "from" / "to" text lines
1901
1902        side -- 0 or 1 indicating "from" or "to" text
1903        flag -- indicates if difference on line
1904        linenum -- line number (used for line number column)
1905        text -- line text to be marked up
1906        """
1907        try:
1908            linenum = '%d' % linenum
1909            id = ' id="%s%s"' % (self._prefix[side],linenum)
1910        except TypeError:
1911            # handle blank lines where linenum is '>' or ''
1912            id = ''
1913        # replace those things that would get confused with HTML symbols
1914        text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
1915
1916        # make space non-breakable so they don't get compressed or line wrapped
1917        text = text.replace(' ','&nbsp;').rstrip()
1918
1919        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
1920               % (id,linenum,text)
1921
1922    def _make_prefix(self):
1923        """Create unique anchor prefixes"""
1924
1925        # Generate a unique anchor prefix so multiple tables
1926        # can exist on the same HTML page without conflicts.
1927        fromprefix = "from%d_" % HtmlDiff._default_prefix
1928        toprefix = "to%d_" % HtmlDiff._default_prefix
1929        HtmlDiff._default_prefix += 1
1930        # store prefixes so line format method has access
1931        self._prefix = [fromprefix,toprefix]
1932
1933    def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
1934        """Makes list of "next" links"""
1935
1936        # all anchor names will be generated using the unique "to" prefix
1937        toprefix = self._prefix[1]
1938
1939        # process change flags, generating middle column of next anchors/links
1940        next_id = ['']*len(flaglist)
1941        next_href = ['']*len(flaglist)
1942        num_chg, in_change = 0, False
1943        last = 0
1944        for i,flag in enumerate(flaglist):
1945            if flag:
1946                if not in_change:
1947                    in_change = True
1948                    last = i
1949                    # at the beginning of a change, drop an anchor a few lines
1950                    # (the context lines) before the change for the previous
1951                    # link
1952                    i = max([0,i-numlines])
1953                    next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
1954                    # at the beginning of a change, drop a link to the next
1955                    # change
1956                    num_chg += 1
1957                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
1958                         toprefix,num_chg)
1959            else:
1960                in_change = False
1961        # check for cases where there is no content to avoid exceptions
1962        if not flaglist:
1963            flaglist = [False]
1964            next_id = ['']
1965            next_href = ['']
1966            last = 0
1967            if context:
1968                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
1969                tolist = fromlist
1970            else:
1971                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
1972        # if not a change on first line, drop a link
1973        if not flaglist[0]:
1974            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
1975        # redo the last link to link to the top
1976        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
1977
1978        return fromlist,tolist,flaglist,next_href,next_id
1979
1980    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
1981                   numlines=5):
1982        """Returns HTML table of side by side comparison with change highlights
1983
1984        Arguments:
1985        fromlines -- list of "from" lines
1986        tolines -- list of "to" lines
1987        fromdesc -- "from" file column header string
1988        todesc -- "to" file column header string
1989        context -- set to True for contextual differences (defaults to False
1990            which shows full differences).
1991        numlines -- number of context lines.  When context is set True,
1992            controls number of lines displayed before and after the change.
1993            When context is False, controls the number of lines to place
1994            the "next" link anchors before the next change (so click of
1995            "next" link jumps to just before the change).
1996        """
1997
1998        # make unique anchor prefixes so that multiple tables may exist
1999        # on the same page without conflict.
2000        self._make_prefix()
2001
2002        # change tabs to spaces before it gets more difficult after we insert
2003        # markup
2004        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
2005
2006        # create diffs iterator which generates side by side from/to data
2007        if context:
2008            context_lines = numlines
2009        else:
2010            context_lines = None
2011        diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
2012                      charjunk=self._charjunk)
2013
2014        # set up iterator to wrap lines that exceed desired width
2015        if self._wrapcolumn:
2016            diffs = self._line_wrapper(diffs)
2017
2018        # collect up from/to lines and flags into lists (also format the lines)
2019        fromlist,tolist,flaglist = self._collect_lines(diffs)
2020
2021        # process change flags, generating middle column of next anchors/links
2022        fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
2023            fromlist,tolist,flaglist,context,numlines)
2024
2025        s = []
2026        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
2027              '<td class="diff_next">%s</td>%s</tr>\n'
2028        for i in range(len(flaglist)):
2029            if flaglist[i] is None:
2030                # mdiff yields None on separator lines skip the bogus ones
2031                # generated for the first line
2032                if i > 0:
2033                    s.append('        </tbody>        \n        <tbody>\n')
2034            else:
2035                s.append( fmt % (next_id[i],next_href[i],fromlist[i],
2036                                           next_href[i],tolist[i]))
2037        if fromdesc or todesc:
2038            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
2039                '<th class="diff_next"><br /></th>',
2040                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
2041                '<th class="diff_next"><br /></th>',
2042                '<th colspan="2" class="diff_header">%s</th>' % todesc)
2043        else:
2044            header_row = ''
2045
2046        table = self._table_template % dict(
2047            data_rows=''.join(s),
2048            header_row=header_row,
2049            prefix=self._prefix[1])
2050
2051        return table.replace('\0+','<span class="diff_add">'). \
2052                     replace('\0-','<span class="diff_sub">'). \
2053                     replace('\0^','<span class="diff_chg">'). \
2054                     replace('\1','</span>'). \
2055                     replace('\t','&nbsp;')
2056
2057del re
2058
2059def restore(delta, which):
2060    r"""
2061    Generate one of the two sequences that generated a delta.
2062
2063    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
2064    lines originating from file 1 or 2 (parameter `which`), stripping off line
2065    prefixes.
2066
2067    Examples:
2068
2069    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
2070    ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
2071    >>> diff = list(diff)
2072    >>> print(''.join(restore(diff, 1)), end="")
2073    one
2074    two
2075    three
2076    >>> print(''.join(restore(diff, 2)), end="")
2077    ore
2078    tree
2079    emu
2080    """
2081    try:
2082        tag = {1: "- ", 2: "+ "}[int(which)]
2083    except KeyError:
2084        raise ValueError('unknown delta choice (must be 1 or 2): %r'
2085                           % which) from None
2086    prefixes = ("  ", tag)
2087    for line in delta:
2088        if line[:2] in prefixes:
2089            yield line[2:]
2090
2091def _test():
2092    import doctest, difflib
2093    return doctest.testmod(difflib)
2094
2095if __name__ == "__main__":
2096    _test()
2097