• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package jdiff;
2 
3 import java.io.*;
4 import java.util.*;
5 
6 /** A class to compare vectors of objects.  The result of comparison
7     is a list of <code>change</code> objects which form an
8     edit script.  The objects compared are traditionally lines
9     of text from two files.  Comparison options such as "ignore
10     whitespace" are implemented by modifying the <code>equals</code>
11     and <code>hashcode</code> methods for the objects compared.
12 <p>
13    The basic algorithm is described in: </br>
14    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
15    Algorithmica Vol. 1 No. 2, 1986, p 251.
16 <p>
17    This class outputs different results from GNU diff 1.15 on some
18    inputs.  Our results are actually better (smaller change list, smaller
19    total size of changes), but it would be nice to know why.  Perhaps
20    there is a memory overwrite bug in GNU diff 1.15.
21 
22   @author Stuart D. Gathman, translated from GNU diff 1.15
23     Copyright (C) 2000  Business Management Systems, Inc.
24 <p>
25     This program is free software; you can redistribute it and/or modify
26     it under the terms of the GNU General Public License as published by
27     the Free Software Foundation; either version 1, or (at your option)
28     any later version.
29 <p>
30     This program is distributed in the hope that it will be useful,
31     but WITHOUT ANY WARRANTY; without even the implied warranty of
32     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
33     GNU General Public License for more details.
34 <p>
35     You should have received a copy of the <a href=COPYING.txt>
36     GNU General Public License</a>
37     along with this program; if not, write to the Free Software
38     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
39 
40  */
41 
42 public class DiffMyers
43 {
44 
45   /** Prepare to find differences between two arrays.  Each element of
46       the arrays is translated to an "equivalence number" based on
47       the result of <code>equals</code>.  The original Object arrays
48       are no longer needed for computing the differences.  They will
49       be needed again later to print the results of the comparison as
50       an edit script, if desired.
51    */
DiffMyers(Object[] a,Object[] b)52   public DiffMyers(Object[] a,Object[] b)
53   {
54     Hashtable h = new Hashtable(a.length + b.length);
55     filevec[0] = new file_data(a,h);
56     filevec[1] = new file_data(b,h);
57   }
58 
59   /** 1 more than the maximum equivalence value used for this or its
60      sibling file. */
61   private int equiv_max = 1;
62 
63   /** When set to true, the comparison uses a heuristic to speed it up.
64     With this heuristic, for files with a constant small density
65     of changes, the algorithm is linear in the file size.  */
66   public boolean heuristic = false;
67 
68   /** When set to true, the algorithm returns a guarranteed minimal
69       set of changes.  This makes things slower, sometimes much slower. */
70   public boolean no_discards = false;
71 
72   private int[] xvec, yvec;        /* Vectors being compared. */
73   private int[] fdiag;                /* Vector, indexed by diagonal, containing
74                                    the X coordinate of the point furthest
75                                    along the given diagonal in the forward
76                                    search of the edit matrix. */
77   private int[] bdiag;                /* Vector, indexed by diagonal, containing
78                                    the X coordinate of the point furthest
79                                    along the given diagonal in the backward
80                                    search of the edit matrix. */
81   private int fdiagoff, bdiagoff;
82   private final file_data[] filevec = new file_data[2];
83   private int cost;
84 
85   /** Find the midpoint of the shortest edit script for a specified
86      portion of the two files.
87 
88      We scan from the beginnings of the files, and simultaneously from the ends,
89      doing a breadth-first search through the space of edit-sequence.
90      When the two searches meet, we have found the midpoint of the shortest
91      edit sequence.
92 
93      The value returned is the number of the diagonal on which the midpoint lies.
94      The diagonal number equals the number of inserted lines minus the number
95      of deleted lines (counting only lines before the midpoint).
96      The edit cost is stored into COST; this is the total number of
97      lines inserted or deleted (counting only lines before the midpoint).
98 
99      This function assumes that the first lines of the specified portions
100      of the two files do not match, and likewise that the last lines do not
101      match.  The caller must trim matching lines from the beginning and end
102      of the portions it is going to specify.
103 
104      Note that if we return the "wrong" diagonal value, or if
105      the value of bdiag at that diagonal is "wrong",
106      the worst this can do is cause suboptimal diff output.
107      It cannot cause incorrect diff output.  */
108 
diag(int xoff, int xlim, int yoff, int ylim)109   private int diag (int xoff, int xlim, int yoff, int ylim)
110   {
111     final int[] fd = fdiag;        // Give the compiler a chance.
112     final int[] bd = bdiag;        // Additional help for the compiler.
113     final int[] xv = xvec;                // Still more help for the compiler.
114     final int[] yv = yvec;                // And more and more . . .
115     final int dmin = xoff - ylim;        // Minimum valid diagonal.
116     final int dmax = xlim - yoff;        // Maximum valid diagonal.
117     final int fmid = xoff - yoff;        // Center diagonal of top-down search.
118     final int bmid = xlim - ylim;        // Center diagonal of bottom-up search.
119     int fmin = fmid, fmax = fmid;        // Limits of top-down search.
120     int bmin = bmid, bmax = bmid;        // Limits of bottom-up search.
121     /* True if southeast corner is on an odd
122                                      diagonal with respect to the northwest. */
123     final boolean odd = (fmid - bmid & 1) != 0;
124 
125     fd[fdiagoff + fmid] = xoff;
126     bd[bdiagoff + bmid] = xlim;
127 
128     for (int c = 1;; ++c)
129       {
130         int d;                        /* Active diagonal. */
131         boolean big_snake = false;
132 
133         /* Extend the top-down search by an edit step in each diagonal. */
134         if (fmin > dmin)
135           fd[fdiagoff + --fmin - 1] = -1;
136         else
137           ++fmin;
138         if (fmax < dmax)
139           fd[fdiagoff + ++fmax + 1] = -1;
140         else
141           --fmax;
142         for (d = fmax; d >= fmin; d -= 2)
143           {
144             int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
145 
146             if (tlo >= thi)
147               x = tlo + 1;
148             else
149               x = thi;
150             oldx = x;
151             y = x - d;
152             while (x < xlim && y < ylim && xv[x] == yv[y]) {
153               ++x; ++y;
154             }
155             if (x - oldx > 20)
156               big_snake = true;
157             fd[fdiagoff + d] = x;
158             if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
159               {
160                 cost = 2 * c - 1;
161                 return d;
162               }
163           }
164 
165         /* Similar extend the bottom-up search. */
166         if (bmin > dmin)
167           bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
168         else
169           ++bmin;
170         if (bmax < dmax)
171           bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
172         else
173           --bmax;
174         for (d = bmax; d >= bmin; d -= 2)
175           {
176             int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
177 
178             if (tlo < thi)
179               x = tlo;
180             else
181               x = thi - 1;
182             oldx = x;
183             y = x - d;
184             while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
185               --x; --y;
186             }
187             if (oldx - x > 20)
188               big_snake = true;
189             bd[bdiagoff + d] = x;
190             if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d])
191               {
192                 cost = 2 * c;
193                 return d;
194               }
195           }
196 
197         /* Heuristic: check occasionally for a diagonal that has made
198            lots of progress compared with the edit distance.
199            If we have any such, find the one that has made the most
200            progress and return it as if it had succeeded.
201 
202            With this heuristic, for files with a constant small density
203            of changes, the algorithm is linear in the file size.  */
204 
205         if (c > 200 && big_snake && heuristic)
206           {
207             int best = 0;
208             int bestpos = -1;
209 
210             for (d = fmax; d >= fmin; d -= 2)
211               {
212                 int dd = d - fmid;
213                 if ((fd[fdiagoff + d] - xoff)*2 - dd > 12 * (c + (dd > 0 ? dd : -dd)))
214                   {
215                     if (fd[fdiagoff + d] * 2 - dd > best
216                         && fd[fdiagoff + d] - xoff > 20
217                         && fd[fdiagoff + d] - d - yoff > 20)
218                       {
219                         int k;
220                         int x = fd[fdiagoff + d];
221 
222                         /* We have a good enough best diagonal;
223                            now insist that it end with a significant snake.  */
224                         for (k = 1; k <= 20; k++)
225                           if (xvec[x - k] != yvec[x - d - k])
226                             break;
227 
228                         if (k == 21)
229                           {
230                             best = fd[fdiagoff + d] * 2 - dd;
231                             bestpos = d;
232                           }
233                       }
234                   }
235               }
236             if (best > 0)
237               {
238                 cost = 2 * c - 1;
239                 return bestpos;
240               }
241 
242             best = 0;
243             for (d = bmax; d >= bmin; d -= 2)
244               {
245                 int dd = d - bmid;
246                 if ((xlim - bd[bdiagoff + d])*2 + dd > 12 * (c + (dd > 0 ? dd : -dd)))
247                   {
248                     if ((xlim - bd[bdiagoff + d]) * 2 + dd > best
249                         && xlim - bd[bdiagoff + d] > 20
250                         && ylim - (bd[bdiagoff + d] - d) > 20)
251                       {
252                         /* We have a good enough best diagonal;
253                            now insist that it end with a significant snake.  */
254                         int k;
255                         int x = bd[bdiagoff + d];
256 
257                         for (k = 0; k < 20; k++)
258                           if (xvec[x + k] != yvec[x - d + k])
259                             break;
260                         if (k == 20)
261                           {
262                             best = (xlim - bd[bdiagoff + d]) * 2 + dd;
263                             bestpos = d;
264                           }
265                       }
266                   }
267               }
268             if (best > 0)
269               {
270                 cost = 2 * c - 1;
271                 return bestpos;
272               }
273           }
274       }
275   }
276 
277   /** Compare in detail contiguous subsequences of the two files
278      which are known, as a whole, to match each other.
279 
280      The results are recorded in the vectors filevec[N].changed_flag, by
281      storing a 1 in the element for each line that is an insertion or deletion.
282 
283      The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
284 
285      Note that XLIM, YLIM are exclusive bounds.
286      All line numbers are origin-0 and discarded lines are not counted.  */
287 
compareseq(int xoff, int xlim, int yoff, int ylim)288   private void compareseq (int xoff, int xlim, int yoff, int ylim) {
289     /* Slide down the bottom initial diagonal. */
290     while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
291       ++xoff; ++yoff;
292     }
293     /* Slide up the top initial diagonal. */
294     while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
295       --xlim; --ylim;
296     }
297 
298     /* Handle simple cases. */
299     if (xoff == xlim)
300       while (yoff < ylim)
301         filevec[1].changed_flag[1+filevec[1].realindexes[yoff++]] = true;
302     else if (yoff == ylim)
303       while (xoff < xlim)
304         filevec[0].changed_flag[1+filevec[0].realindexes[xoff++]] = true;
305     else
306       {
307         /* Find a point of correspondence in the middle of the files.  */
308 
309         int d = diag (xoff, xlim, yoff, ylim);
310         int c = cost;
311         int f = fdiag[fdiagoff + d];
312         int b = bdiag[bdiagoff + d];
313 
314         if (c == 1)
315           {
316             /* This should be impossible, because it implies that
317                one of the two subsequences is empty,
318                and that case was handled above without calling `diag'.
319                Let's verify that this is true.  */
320             throw new IllegalArgumentException("Empty subsequence");
321           }
322         else
323           {
324             /* Use that point to split this problem into two subproblems.  */
325             compareseq (xoff, b, yoff, b - d);
326             /* This used to use f instead of b,
327                but that is incorrect!
328                It is not necessarily the case that diagonal d
329                has a snake from b to f.  */
330             compareseq (b, xlim, b - d, ylim);
331           }
332       }
333   }
334 
335   /** Discard lines from one file that have no matches in the other file.
336    */
337 
discard_confusing_lines()338   private void discard_confusing_lines() {
339     filevec[0].discard_confusing_lines(filevec[1]);
340     filevec[1].discard_confusing_lines(filevec[0]);
341   }
342 
343   private boolean inhibit = false;
344 
345   /** Adjust inserts/deletes of blank lines to join changes
346      as much as possible.
347    */
348 
shift_boundaries()349   private void shift_boundaries() {
350     if (inhibit)
351       return;
352     filevec[0].shift_boundaries(filevec[1]);
353     filevec[1].shift_boundaries(filevec[0]);
354   }
355 
356   /** Scan the tables of which lines are inserted and deleted,
357      producing an edit script in reverse order.  */
358 
build_reverse_script()359   private change build_reverse_script() {
360     change script = null;
361     final boolean[] changed0 = filevec[0].changed_flag;
362     final boolean[] changed1 = filevec[1].changed_flag;
363     final int len0 = filevec[0].buffered_lines;
364     final int len1 = filevec[1].buffered_lines;
365 
366     /* Note that changedN[len0] does exist, and contains 0.  */
367 
368     int i0 = 0, i1 = 0;
369 
370     while (i0 < len0 || i1 < len1)
371       {
372         if (changed0[1+i0] || changed1[1+i1])
373           {
374             int line0 = i0, line1 = i1;
375 
376             /* Find # lines changed here in each file.  */
377             while (changed0[1+i0]) ++i0;
378             while (changed1[1+i1]) ++i1;
379 
380             /* Record this change.  */
381             script = new change(line0, line1, i0 - line0, i1 - line1, script);
382           }
383 
384         /* We have reached lines in the two files that match each other.  */
385         i0++; i1++;
386       }
387 
388     return script;
389   }
390 
391   /** Scan the tables of which lines are inserted and deleted,
392      producing an edit script in forward order.  */
393 
build_script()394   private change build_script() {
395     change script = null;
396     final boolean[] changed0 = filevec[0].changed_flag;
397     final boolean[] changed1 = filevec[1].changed_flag;
398     final int len0 = filevec[0].buffered_lines;
399     final int len1 = filevec[1].buffered_lines;
400     int i0 = len0, i1 = len1;
401 
402     /* Note that changedN[-1] does exist, and contains 0.  */
403 
404     while (i0 >= 0 || i1 >= 0)
405       {
406         if (changed0[i0] || changed1[i1])
407           {
408             int line0 = i0, line1 = i1;
409 
410             /* Find # lines changed here in each file.  */
411             while (changed0[i0]) --i0;
412             while (changed1[i1]) --i1;
413 
414             /* Record this change.  */
415             script = new change(i0, i1, line0 - i0, line1 - i1, script);
416           }
417 
418         /* We have reached lines in the two files that match each other.  */
419         i0--; i1--;
420       }
421 
422     return script;
423   }
424 
425   /* Report the differences of two files.  DEPTH is the current directory
426      depth. */
diff_2(final boolean reverse)427   public change diff_2(final boolean reverse) {
428 
429     /* Some lines are obviously insertions or deletions
430        because they don't match anything.  Detect them now,
431        and avoid even thinking about them in the main comparison algorithm.  */
432 
433     discard_confusing_lines ();
434 
435     /* Now do the main comparison algorithm, considering just the
436        undiscarded lines.  */
437 
438     xvec = filevec[0].undiscarded;
439     yvec = filevec[1].undiscarded;
440 
441     int diags =
442       filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
443     fdiag = new int[diags];
444     fdiagoff = filevec[1].nondiscarded_lines + 1;
445     bdiag = new int[diags];
446     bdiagoff = filevec[1].nondiscarded_lines + 1;
447 
448     compareseq (0, filevec[0].nondiscarded_lines,
449                 0, filevec[1].nondiscarded_lines);
450     fdiag = null;
451     bdiag = null;
452 
453     /* Modify the results slightly to make them prettier
454        in cases where that can validly be done.  */
455 
456     shift_boundaries ();
457 
458     /* Get the results of comparison in the form of a chain
459        of `struct change's -- an edit script.  */
460 
461     if (reverse)
462       return build_reverse_script();
463     else
464       return build_script();
465   }
466 
467   /** The result of comparison is an "edit script": a chain of change objects.
468      Each change represents one place where some lines are deleted
469      and some are inserted.
470 
471      LINE0 and LINE1 are the first affected lines in the two files (origin 0).
472      DELETED is the number of lines deleted here from file 0.
473      INSERTED is the number of lines inserted here in file 1.
474 
475      If DELETED is 0 then LINE0 is the number of the line before
476      which the insertion was done; vice versa for INSERTED and LINE1.  */
477 
478   public static class change {
479     /** Previous or next edit command. */
480     public change link;
481     /** # lines of file 1 changed here.  */
482     public int inserted;
483     /** # lines of file 0 changed here.  */
484     public int deleted;
485     /** Line number of 1st deleted line.  */
486     public final int line0;
487     /** Line number of 1st inserted line.  */
488     public final int line1;
489 
490     /** Cons an additional entry onto the front of an edit script OLD.
491        LINE0 and LINE1 are the first affected lines in the two files (origin 0).
492        DELETED is the number of lines deleted here from file 0.
493        INSERTED is the number of lines inserted here in file 1.
494 
495        If DELETED is 0 then LINE0 is the number of the line before
496        which the insertion was done; vice versa for INSERTED and LINE1.  */
change(int line0, int line1, int deleted, int inserted, change old)497     change(int line0, int line1, int deleted, int inserted, change old) {
498       this.line0 = line0;
499       this.line1 = line1;
500       this.inserted = inserted;
501       this.deleted = deleted;
502       this.link = old;
503       //System.err.println(line0+","+line1+","+inserted+","+deleted);
504     }
505   }
506 
507   /** Data on one input file being compared.
508    */
509 
510   class file_data {
511 
512     /** Allocate changed array for the results of comparison.  */
clear()513     void clear() {
514       /* Allocate a flag for each line of each file, saying whether that line
515          is an insertion or deletion.
516          Allocate an extra element, always zero, at each end of each vector.
517        */
518       changed_flag = new boolean[buffered_lines + 2];
519     }
520 
521     /** Return equiv_count[I] as the number of lines in this file
522        that fall in equivalence class I.
523          @return the array of equivalence class counts.
524      */
equivCount()525     int[] equivCount() {
526       int[] equiv_count = new int[equiv_max];
527       for (int i = 0; i < buffered_lines; ++i)
528         ++equiv_count[equivs[i]];
529       return equiv_count;
530     }
531 
532     /** Discard lines that have no matches in another file.
533 
534        A line which is discarded will not be considered by the actual
535        comparison algorithm; it will be as if that line were not in the file.
536        The file's `realindexes' table maps virtual line numbers
537        (which don't count the discarded lines) into real line numbers;
538        this is how the actual comparison algorithm produces results
539        that are comprehensible when the discarded lines are counted.
540 <p>
541        When we discard a line, we also mark it as a deletion or insertion
542        so that it will be printed in the output.
543       @param f the other file
544      */
discard_confusing_lines(file_data f)545     void discard_confusing_lines(file_data f) {
546       clear();
547     /* Set up table of which lines are going to be discarded. */
548       final byte[] discarded = discardable(f.equivCount());
549 
550     /* Don't really discard the provisional lines except when they occur
551        in a run of discardables, with nonprovisionals at the beginning
552        and end.  */
553       filterDiscards(discarded);
554 
555     /* Actually discard the lines. */
556       discard(discarded);
557     }
558 
559     /** Mark to be discarded each line that matches no line of another file.
560        If a line matches many lines, mark it as provisionally discardable.
561        @see equivCount()
562        @param counts The count of each equivalence number for the other file.
563        @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
564                for each line
565      */
566 
discardable(final int[] counts)567     private byte[] discardable(final int[] counts) {
568       final int end = buffered_lines;
569       final byte[] discards = new byte[end];
570       final int[] equivs = this.equivs;
571       int many = 5;
572       int tem = end / 64;
573 
574       /* Multiply MANY by approximate square root of number of lines.
575          That is the threshold for provisionally discardable lines.  */
576       while ((tem = tem >> 2) > 0)
577         many *= 2;
578 
579       for (int i = 0; i < end; i++)
580         {
581           int nmatch;
582           if (equivs[i] == 0)
583             continue;
584           nmatch = counts[equivs[i]];
585           if (nmatch == 0)
586             discards[i] = 1;
587           else if (nmatch > many)
588             discards[i] = 2;
589         }
590       return discards;
591     }
592 
593     /** Don't really discard the provisional lines except when they occur
594        in a run of discardables, with nonprovisionals at the beginning
595        and end.  */
596 
filterDiscards(final byte[] discards)597     private void filterDiscards(final byte[] discards) {
598         final int end = buffered_lines;
599 
600         for (int i = 0; i < end; i++)
601           {
602             /* Cancel provisional discards not in middle of run of discards.  */
603             if (discards[i] == 2)
604               discards[i] = 0;
605             else if (discards[i] != 0)
606               {
607                 /* We have found a nonprovisional discard.  */
608                 int j;
609                 int length;
610                 int provisional = 0;
611 
612                 /* Find end of this run of discardable lines.
613                    Count how many are provisionally discardable.  */
614                 for (j = i; j < end; j++)
615                   {
616                     if (discards[j] == 0)
617                       break;
618                     if (discards[j] == 2)
619                       ++provisional;
620                   }
621 
622                 /* Cancel provisional discards at end, and shrink the run.  */
623                 while (j > i && discards[j - 1] == 2) {
624                   discards[--j] = 0; --provisional;
625                 }
626 
627                 /* Now we have the length of a run of discardable lines
628                    whose first and last are not provisional.  */
629                 length = j - i;
630 
631                 /* If 1/4 of the lines in the run are provisional,
632                    cancel discarding of all provisional lines in the run.  */
633                 if (provisional * 4 > length)
634                   {
635                     while (j > i)
636                       if (discards[--j] == 2)
637                         discards[j] = 0;
638                   }
639                 else
640                   {
641                     int consec;
642                     int minimum = 1;
643                     int tem = length / 4;
644 
645                     /* MINIMUM is approximate square root of LENGTH/4.
646                        A subrun of two or more provisionals can stand
647                        when LENGTH is at least 16.
648                        A subrun of 4 or more can stand when LENGTH >= 64.  */
649                     while ((tem = tem >> 2) > 0)
650                       minimum *= 2;
651                     minimum++;
652 
653                     /* Cancel any subrun of MINIMUM or more provisionals
654                        within the larger run.  */
655                     for (j = 0, consec = 0; j < length; j++)
656                       if (discards[i + j] != 2)
657                         consec = 0;
658                       else if (minimum == ++consec)
659                         /* Back up to start of subrun, to cancel it all.  */
660                         j -= consec;
661                       else if (minimum < consec)
662                         discards[i + j] = 0;
663 
664                     /* Scan from beginning of run
665                        until we find 3 or more nonprovisionals in a row
666                        or until the first nonprovisional at least 8 lines in.
667                        Until that point, cancel any provisionals.  */
668                     for (j = 0, consec = 0; j < length; j++)
669                       {
670                         if (j >= 8 && discards[i + j] == 1)
671                           break;
672                         if (discards[i + j] == 2) {
673                           consec = 0; discards[i + j] = 0;
674                         }
675                         else if (discards[i + j] == 0)
676                           consec = 0;
677                         else
678                           consec++;
679                         if (consec == 3)
680                           break;
681                       }
682 
683                     /* I advances to the last line of the run.  */
684                     i += length - 1;
685 
686                     /* Same thing, from end.  */
687                     for (j = 0, consec = 0; j < length; j++)
688                       {
689                         if (j >= 8 && discards[i - j] == 1)
690                           break;
691                         if (discards[i - j] == 2) {
692                           consec = 0; discards[i - j] = 0;
693                         }
694                         else if (discards[i - j] == 0)
695                           consec = 0;
696                         else
697                           consec++;
698                         if (consec == 3)
699                           break;
700                       }
701                   }
702               }
703           }
704       }
705 
706     /** Actually discard the lines.
707       @param discards flags lines to be discarded
708      */
discard(final byte[] discards)709     private void discard(final byte[] discards) {
710       final int end = buffered_lines;
711       int j = 0;
712       for (int i = 0; i < end; ++i)
713         if (no_discards || discards[i] == 0)
714           {
715             undiscarded[j] = equivs[i];
716             realindexes[j++] = i;
717           }
718         else
719           changed_flag[1+i] = true;
720       nondiscarded_lines = j;
721     }
722 
file_data(Object[] data,Hashtable h)723     file_data(Object[] data,Hashtable h) {
724       buffered_lines = data.length;
725 
726       equivs = new int[buffered_lines];
727       undiscarded = new int[buffered_lines];
728       realindexes = new int[buffered_lines];
729 
730       for (int i = 0; i < data.length; ++i) {
731         Integer ir = (Integer)h.get(data[i]);
732         if (ir == null)
733           h.put(data[i],new Integer(equivs[i] = equiv_max++));
734         else
735           equivs[i] = ir.intValue();
736       }
737     }
738 
739     /** Adjust inserts/deletes of blank lines to join changes
740        as much as possible.
741 
742        We do something when a run of changed lines include a blank
743        line at one end and have an excluded blank line at the other.
744        We are free to choose which blank line is included.
745        `compareseq' always chooses the one at the beginning,
746        but usually it is cleaner to consider the following blank line
747        to be the "change".  The only exception is if the preceding blank line
748        would join this change to other changes.
749       @param f the file being compared against
750     */
751 
shift_boundaries(file_data f)752     void shift_boundaries(file_data f) {
753       final boolean[] changed = changed_flag;
754       final boolean[] other_changed = f.changed_flag;
755       int i = 0;
756       int j = 0;
757       int i_end = buffered_lines;
758       int preceding = -1;
759       int other_preceding = -1;
760 
761       for (;;)
762         {
763           int start, end, other_start;
764 
765           /* Scan forwards to find beginning of another run of changes.
766              Also keep track of the corresponding point in the other file.  */
767 
768           while (i < i_end && !changed[1+i])
769             {
770               while (other_changed[1+j++])
771                 /* Non-corresponding lines in the other file
772                    will count as the preceding batch of changes.  */
773                 other_preceding = j;
774               i++;
775             }
776 
777           if (i == i_end)
778             break;
779 
780           start = i;
781           other_start = j;
782 
783           for (;;)
784             {
785               /* Now find the end of this run of changes.  */
786 
787               while (i < i_end && changed[1+i]) i++;
788               end = i;
789 
790               /* If the first changed line matches the following unchanged one,
791                  and this run does not follow right after a previous run,
792                  and there are no lines deleted from the other file here,
793                  then classify the first changed line as unchanged
794                  and the following line as changed in its place.  */
795 
796               /* You might ask, how could this run follow right after another?
797                  Only because the previous run was shifted here.  */
798 
799               if (end != i_end
800                   && equivs[start] == equivs[end]
801                   && !other_changed[1+j]
802                   && end != i_end
803                   && !((preceding >= 0 && start == preceding)
804                        || (other_preceding >= 0
805                            && other_start == other_preceding)))
806                 {
807                   changed[1+end++] = true;
808                   changed[1+start++] = false;
809                   ++i;
810                   /* Since one line-that-matches is now before this run
811                      instead of after, we must advance in the other file
812                      to keep in synch.  */
813                   ++j;
814                 }
815               else
816                 break;
817             }
818 
819           preceding = i;
820           other_preceding = j;
821         }
822     }
823 
824     /** Number of elements (lines) in this file. */
825     final int buffered_lines;
826 
827     /** Vector, indexed by line number, containing an equivalence code for
828        each line.  It is this vector that is actually compared with that
829        of another file to generate differences. */
830     private final int[]            equivs;
831 
832     /** Vector, like the previous one except that
833        the elements for discarded lines have been squeezed out.  */
834     final int[]           undiscarded;
835 
836     /** Vector mapping virtual line numbers (not counting discarded lines)
837        to real ones (counting those lines).  Both are origin-0.  */
838     final int[]           realindexes;
839 
840     /** Total number of nondiscarded lines. */
841     int                    nondiscarded_lines;
842 
843     /** Array, indexed by real origin-1 line number,
844        containing true for a line that is an insertion or a deletion.
845        The results of comparison are stored here.  */
846     boolean[]            changed_flag;
847 
848   }
849 
850 }
851