source: josm/trunk/src/org/openstreetmap/josm/tools/Diff.java@ 9519

Last change on this file since 9519 was 9231, checked in by Don-vip, 8 years ago

javadoc update

  • Property svn:eol-style set to native
File size: 36.7 KB
Line 
1// License: GPL. For details, see LICENSE file.
2package org.openstreetmap.josm.tools;
3
4//// Taken from http://www.bmsi.com/java/#diff
5
6// http://www.bmsi.com/java/DiffPrint.java could also be useful
7
8/*
9 * $Log: Diff.java,v $
10 * Revision 1.15 2013/04/01 16:27:31 stuart
11 * Fix DiffPrint unified format with test cases.
12 * Begin porting some diff-2.7 features.
13 *
14 * Revision 1.14 2010/03/03 21:21:25 stuart
15 * Test new direct equivalence API
16 *
17 * Revision 1.13 2009/12/07 17:43:17 stuart
18 * Compute equivMax for int[] ctor
19 *
20 * Revision 1.12 2009/12/07 17:34:46 stuart
21 * Ctor with int[].
22 *
23 * Revision 1.11 2009/11/15 01:11:54 stuart
24 * Diff doesn't need to be generic
25 *
26 * Revision 1.10 2009/11/15 00:54:03 stuart
27 * Update to Java 5 containers
28 *
29 * Revision 1.7 2009/01/19 03:05:26 stuart
30 * Fix StackOverflow bug with heuristic on reported by Jimmy Han.
31 *
32 * Revision 1.6 2003/03/06 22:51:32 stuart
33 * Convert to CVS
34 *
35 * Revision 1.5 2002/07/19 19:14:40 stuart
36 * fix reverseScript, make change ctor public, update docs
37 *
38 * Revision 1.4 2002/04/09 17:53:39 stuart
39 * More flexible interface for diff() function.
40 *
41 * Revision 1.3 2000/03/03 21:58:03 stuart
42 * move discard_confusing_lines and shift_boundaries to class file_data
43 *
44 * Revision 1.2 2000/03/02 16:37:38 stuart
45 * Add GPL and copyright
46 *
47 */
48
49import java.util.HashMap;
50import java.util.Map;
51
52/** A class to compare vectors of objects. The result of comparison
53 is a list of <code>change</code> objects which form an
54 edit script. The objects compared are traditionally lines
55 of text from two files. Comparison options such as "ignore
56 whitespace" are implemented by modifying the <code>equals</code>
57 and <code>hashcode</code> methods for the objects compared.
58<p>
59 The basic algorithm is described in: <br>
60 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
61 Algorithmica Vol. 1 No. 2, 1986, p 251.
62<p>
63 This class outputs different results from GNU diff 1.15 on some
64 inputs. Our results are actually better (smaller change list, smaller
65 total size of changes), but it would be nice to know why. Perhaps
66 there is a memory overwrite bug in GNU diff 1.15.
67
68 @author Stuart D. Gathman, translated from GNU diff 1.15
69 Copyright (C) 2000 Business Management Systems, Inc.
70<p>
71 This program is free software; you can redistribute it and/or modify
72 it under the terms of the GNU General Public License as published by
73 the Free Software Foundation; either version 1, or (at your option)
74 any later version.
75<p>
76 This program is distributed in the hope that it will be useful,
77 but WITHOUT ANY WARRANTY; without even the implied warranty of
78 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
79 GNU General Public License for more details.
80<p>
81 You should have received a copy of the <a href=COPYING.txt>
82 GNU General Public License</a>
83 along with this program; if not, write to the Free Software
84 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
85 */
86public class Diff {
87
88 /** Prepare to find differences between two arrays. Each element of
89 the arrays is translated to an "equivalence number" based on
90 the result of <code>equals</code>. The original Object arrays
91 are no longer needed for computing the differences. They will
92 be needed again later to print the results of the comparison as
93 an edit script, if desired.
94 * @param a first array
95 * @param b second array
96 */
97 public Diff(Object[] a, Object[] b) {
98 Map<Object, Integer> h = new HashMap<>(a.length + b.length);
99 filevec = new FileData[] {new FileData(a, h), new FileData(b, h)};
100 }
101
102 /** 1 more than the maximum equivalence value used for this or its
103 sibling file. */
104 private int equivMax = 1;
105
106 /** When set to true, the comparison uses a heuristic to speed it up.
107 With this heuristic, for files with a constant small density
108 of changes, the algorithm is linear in the file size. */
109 public boolean heuristic;
110
111 /** When set to true, the algorithm returns a guarranteed minimal
112 set of changes. This makes things slower, sometimes much slower. */
113 public boolean noDiscards;
114
115 private int[] xvec, yvec; /* Vectors being compared. */
116 private int[] fdiag; /* Vector, indexed by diagonal, containing
117 the X coordinate of the point furthest
118 along the given diagonal in the forward
119 search of the edit matrix. */
120 private int[] bdiag; /* Vector, indexed by diagonal, containing
121 the X coordinate of the point furthest
122 along the given diagonal in the backward
123 search of the edit matrix. */
124 private int fdiagoff, bdiagoff;
125 private final FileData[] filevec;
126 private int cost;
127 /** Snakes bigger than this are considered "big". */
128 private static final int SNAKE_LIMIT = 20;
129
130 /**
131 * Find the midpoint of the shortest edit script for a specified
132 * portion of the two files.
133 *
134 * We scan from the beginnings of the files, and simultaneously from the ends,
135 * doing a breadth-first search through the space of edit-sequence.
136 * When the two searches meet, we have found the midpoint of the shortest
137 * edit sequence.
138 *
139 * The value returned is the number of the diagonal on which the midpoint lies.
140 * The diagonal number equals the number of inserted lines minus the number
141 * of deleted lines (counting only lines before the midpoint).
142 * The edit cost is stored into COST; this is the total number of
143 * lines inserted or deleted (counting only lines before the midpoint).
144 *
145 * This function assumes that the first lines of the specified portions
146 * of the two files do not match, and likewise that the last lines do not
147 * match. The caller must trim matching lines from the beginning and end
148 * of the portions it is going to specify.
149 *
150 * Note that if we return the "wrong" diagonal value, or if
151 * the value of bdiag at that diagonal is "wrong",
152 * the worst this can do is cause suboptimal diff output.
153 * It cannot cause incorrect diff output.
154 * @param xoff xoff
155 * @param xlim xlim
156 * @param yoff yoff
157 * @param ylim ylim
158 * @return midpoint of the shortest edit script
159 */
160 private int diag(int xoff, int xlim, int yoff, int ylim) {
161 final int[] fd = fdiag; // Give the compiler a chance.
162 final int[] bd = bdiag; // Additional help for the compiler.
163 final int[] xv = xvec; // Still more help for the compiler.
164 final int[] yv = yvec; // And more and more . . .
165 final int dmin = xoff - ylim; // Minimum valid diagonal.
166 final int dmax = xlim - yoff; // Maximum valid diagonal.
167 final int fmid = xoff - yoff; // Center diagonal of top-down search.
168 final int bmid = xlim - ylim; // Center diagonal of bottom-up search.
169 int fmin = fmid, fmax = fmid; // Limits of top-down search.
170 int bmin = bmid, bmax = bmid; // Limits of bottom-up search.
171 // True if southeast corner is on an odd diagonal with respect to the northwest.
172 final boolean odd = (fmid - bmid & 1) != 0;
173
174 fd[fdiagoff + fmid] = xoff;
175 bd[bdiagoff + bmid] = xlim;
176
177 for (int c = 1;; ++c) {
178 int d; /* Active diagonal. */
179 boolean big_snake = false;
180
181 /* Extend the top-down search by an edit step in each diagonal. */
182 if (fmin > dmin) {
183 fd[fdiagoff + --fmin - 1] = -1;
184 } else {
185 ++fmin;
186 }
187 if (fmax < dmax) {
188 fd[fdiagoff + ++fmax + 1] = -1;
189 } else {
190 --fmax;
191 }
192 for (d = fmax; d >= fmin; d -= 2) {
193 int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
194
195 if (tlo >= thi) {
196 x = tlo + 1;
197 } else {
198 x = thi;
199 }
200 oldx = x;
201 y = x - d;
202 while (x < xlim && y < ylim && xv[x] == yv[y]) {
203 ++x; ++y;
204 }
205 if (x - oldx > SNAKE_LIMIT) {
206 big_snake = true;
207 }
208 fd[fdiagoff + d] = x;
209 if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
210 cost = 2 * c - 1;
211 return d;
212 }
213 }
214
215 /* Similar extend the bottom-up search. */
216 if (bmin > dmin) {
217 bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
218 } else {
219 ++bmin;
220 }
221 if (bmax < dmax) {
222 bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
223 } else {
224 --bmax;
225 }
226 for (d = bmax; d >= bmin; d -= 2) {
227 int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
228
229 if (tlo < thi) {
230 x = tlo;
231 } else {
232 x = thi - 1;
233 }
234 oldx = x;
235 y = x - d;
236 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
237 --x; --y;
238 }
239 if (oldx - x > SNAKE_LIMIT) {
240 big_snake = true;
241 }
242 bd[bdiagoff + d] = x;
243 if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
244 cost = 2 * c;
245 return d;
246 }
247 }
248
249 /* Heuristic: check occasionally for a diagonal that has made
250 lots of progress compared with the edit distance.
251 If we have any such, find the one that has made the most
252 progress and return it as if it had succeeded.
253
254 With this heuristic, for files with a constant small density
255 of changes, the algorithm is linear in the file size. */
256
257 if (c > 200 && big_snake && heuristic) {
258 int best = 0;
259 int bestpos = -1;
260
261 for (d = fmax; d >= fmin; d -= 2) {
262 int dd = d - fmid;
263 int x = fd[fdiagoff + d];
264 int y = x - d;
265 int v = (x - xoff) * 2 - dd;
266 if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
267 if (v > best
268 && xoff + SNAKE_LIMIT <= x && x < xlim
269 && yoff + SNAKE_LIMIT <= y && y < ylim) {
270 /* We have a good enough best diagonal.
271 now insist that it end with a significant snake. */
272 int k;
273
274 for (k = 1; xvec[x - k] == yvec[y - k]; k++) {
275 if (k == SNAKE_LIMIT) {
276 best = v;
277 bestpos = d;
278 break;
279 }
280 }
281 }
282 }
283 }
284 if (best > 0) {
285 cost = 2 * c - 1;
286 return bestpos;
287 }
288
289 best = 0;
290 for (d = bmax; d >= bmin; d -= 2) {
291 int dd = d - bmid;
292 int x = bd[bdiagoff + d];
293 int y = x - d;
294 int v = (xlim - x) * 2 + dd;
295 if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
296 if (v > best
297 && xoff < x && x <= xlim - SNAKE_LIMIT
298 && yoff < y && y <= ylim - SNAKE_LIMIT) {
299 /* We have a good enough best diagonal.
300 now insist that it end with a significant snake. */
301 int k;
302
303 for (k = 0; xvec[x + k] == yvec[y + k]; k++) {
304 if (k == SNAKE_LIMIT) {
305 best = v;
306 bestpos = d;
307 break;
308 }
309 }
310 }
311 }
312 }
313 if (best > 0) {
314 cost = 2 * c - 1;
315 return bestpos;
316 }
317 }
318 }
319 }
320
321 /**
322 * Compare in detail contiguous subsequences of the two files
323 * which are known, as a whole, to match each other.
324 *
325 * The results are recorded in the vectors filevec[N].changed_flag, by
326 * storing a 1 in the element for each line that is an insertion or deletion.
327 *
328 * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
329 *
330 * Note that XLIM, YLIM are exclusive bounds.
331 * All line numbers are origin-0 and discarded lines are not counted.
332 * @param xoff xoff
333 * @param xlim xlim
334 * @param yoff yoff
335 * @param ylim ylim
336 */
337 private void compareseq(int xoff, int xlim, int yoff, int ylim) {
338 /* Slide down the bottom initial diagonal. */
339 while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
340 ++xoff; ++yoff;
341 }
342 /* Slide up the top initial diagonal. */
343 while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
344 --xlim; --ylim;
345 }
346
347 /* Handle simple cases. */
348 if (xoff == xlim) {
349 while (yoff < ylim) {
350 filevec[1].changedFlag[1+filevec[1].realindexes[yoff++]] = true;
351 }
352 } else if (yoff == ylim) {
353 while (xoff < xlim) {
354 filevec[0].changedFlag[1+filevec[0].realindexes[xoff++]] = true;
355 }
356 } else {
357 /* Find a point of correspondence in the middle of the files. */
358
359 int d = diag(xoff, xlim, yoff, ylim);
360 int c = cost;
361 int b = bdiag[bdiagoff + d];
362
363 if (c == 1)
364 /* This should be impossible, because it implies that
365 one of the two subsequences is empty,
366 and that case was handled above without calling `diag'.
367 Let's verify that this is true. */
368 throw new IllegalArgumentException("Empty subsequence");
369 else {
370 /* Use that point to split this problem into two subproblems. */
371 compareseq(xoff, b, yoff, b - d);
372 /* This used to use f instead of b,
373 but that is incorrect!
374 It is not necessarily the case that diagonal d
375 has a snake from b to f. */
376 compareseq(b, xlim, b - d, ylim);
377 }
378 }
379 }
380
381 /** Discard lines from one file that have no matches in the other file.
382 */
383 private void discard_confusing_lines() {
384 filevec[0].discard_confusing_lines(filevec[1]);
385 filevec[1].discard_confusing_lines(filevec[0]);
386 }
387
388 private boolean inhibit;
389
390 /**
391 * Adjust inserts/deletes of blank lines to join changes as much as possible.
392 */
393 private void shift_boundaries() {
394 if (inhibit)
395 return;
396 filevec[0].shift_boundaries(filevec[1]);
397 filevec[1].shift_boundaries(filevec[0]);
398 }
399
400 /**
401 * Script builder.
402 */
403 public interface ScriptBuilder {
404 /**
405 * Scan the tables of which lines are inserted and deleted, producing an edit script.
406 * @param changed0 true for lines in first file which do not match 2nd
407 * @param len0 number of lines in first file
408 * @param changed1 true for lines in 2nd file which do not match 1st
409 * @param len1 number of lines in 2nd file
410 * @return a linked list of changes - or null
411 */
412 Change build_script(
413 boolean[] changed0, int len0,
414 boolean[] changed1, int len1
415 );
416 }
417
418 /** Scan the tables of which lines are inserted and deleted,
419 producing an edit script in reverse order. */
420
421 static class ReverseScript implements ScriptBuilder {
422 @Override
423 public Change build_script(
424 final boolean[] changed0, int len0,
425 final boolean[] changed1, int len1) {
426 Change script = null;
427 int i0 = 0, i1 = 0;
428 while (i0 < len0 || i1 < len1) {
429 if (changed0[1+i0] || changed1[1+i1]) {
430 int line0 = i0, line1 = i1;
431
432 /* Find # lines changed here in each file. */
433 while (changed0[1+i0]) {
434 ++i0;
435 }
436 while (changed1[1+i1]) {
437 ++i1;
438 }
439
440 /* Record this change. */
441 script = new Change(line0, line1, i0 - line0, i1 - line1, script);
442 }
443
444 /* We have reached lines in the two files that match each other. */
445 i0++; i1++;
446 }
447
448 return script;
449 }
450 }
451
452 static class ForwardScript implements ScriptBuilder {
453 /** Scan the tables of which lines are inserted and deleted,
454 producing an edit script in forward order. */
455 @Override
456 public Change build_script(
457 final boolean[] changed0, int len0,
458 final boolean[] changed1, int len1) {
459 Change script = null;
460 int i0 = len0, i1 = len1;
461
462 while (i0 >= 0 || i1 >= 0) {
463 if (changed0[i0] || changed1[i1]) {
464 int line0 = i0, line1 = i1;
465
466 /* Find # lines changed here in each file. */
467 while (changed0[i0]) {
468 --i0;
469 }
470 while (changed1[i1]) {
471 --i1;
472 }
473
474 /* Record this change. */
475 script = new Change(i0, i1, line0 - i0, line1 - i1, script);
476 }
477
478 /* We have reached lines in the two files that match each other. */
479 i0--; i1--;
480 }
481
482 return script;
483 }
484 }
485
486 /** Standard Forward ScriptBuilder. */
487 public static final ScriptBuilder forwardScript = new ForwardScript();
488 /** Standard Reverse ScriptBuilder. */
489 public static final ScriptBuilder reverseScript = new ReverseScript();
490
491 /**
492 * Report the differences of two files. DEPTH is the current directory depth.
493 * @param reverse if {@code true} use {@link #reverseScript} else use {@link #forwardScript}
494 * @return the differences of two files
495 */
496 public final Change diff_2(final boolean reverse) {
497 return diff(reverse ? reverseScript : forwardScript);
498 }
499
500 /**
501 * Get the results of comparison as an edit script. The script
502 * is described by a list of changes. The standard ScriptBuilder
503 * implementations provide for forward and reverse edit scripts.
504 * Alternate implementations could, for instance, list common elements
505 * instead of differences.
506 * @param bld an object to build the script from change flags
507 * @return the head of a list of changes
508 */
509 public Change diff(final ScriptBuilder bld) {
510
511 // Some lines are obviously insertions or deletions because they don't match anything.
512 // Detect them now, and avoid even thinking about them in the main comparison algorithm.
513 discard_confusing_lines();
514
515 // Now do the main comparison algorithm, considering just the undiscarded lines.
516 xvec = filevec[0].undiscarded;
517 yvec = filevec[1].undiscarded;
518
519 int diags = filevec[0].nondiscardedLines + filevec[1].nondiscardedLines + 3;
520 fdiag = new int[diags];
521 fdiagoff = filevec[1].nondiscardedLines + 1;
522 bdiag = new int[diags];
523 bdiagoff = filevec[1].nondiscardedLines + 1;
524
525 compareseq(0, filevec[0].nondiscardedLines,
526 0, filevec[1].nondiscardedLines);
527 fdiag = null;
528 bdiag = null;
529
530 // Modify the results slightly to make them prettier in cases where that can validly be done.
531 shift_boundaries();
532
533 // Get the results of comparison in the form of a chain of `struct change's -- an edit script.
534 return bld.build_script(
535 filevec[0].changedFlag,
536 filevec[0].bufferedLines,
537 filevec[1].changedFlag,
538 filevec[1].bufferedLines
539 );
540 }
541
542 /** The result of comparison is an "edit script": a chain of change objects.
543 Each change represents one place where some lines are deleted
544 and some are inserted.
545
546 LINE0 and LINE1 are the first affected lines in the two files (origin 0).
547 DELETED is the number of lines deleted here from file 0.
548 INSERTED is the number of lines inserted here in file 1.
549
550 If DELETED is 0 then LINE0 is the number of the line before
551 which the insertion was done; vice versa for INSERTED and LINE1. */
552
553 public static class Change {
554 /** Previous or next edit command. */
555 public Change link;
556 /** # lines of file 1 changed here. */
557 public final int inserted;
558 /** # lines of file 0 changed here. */
559 public final int deleted;
560 /** Line number of 1st deleted line. */
561 public final int line0;
562 /** Line number of 1st inserted line. */
563 public final int line1;
564
565 /**
566 * Cons an additional entry onto the front of an edit script OLD.
567 * LINE0 and LINE1 are the first affected lines in the two files (origin 0).
568 * DELETED is the number of lines deleted here from file 0.
569 * INSERTED is the number of lines inserted here in file 1.
570 *
571 * If DELETED is 0 then LINE0 is the number of the line before
572 * which the insertion was done; vice versa for INSERTED and LINE1.
573 * @param line0 first affected lines in the two files (origin 0)
574 * @param line1 first affected lines in the two files (origin 0)
575 * @param deleted the number of lines deleted here from file 0
576 * @param inserted the number of lines inserted here in file 1
577 * @param old edit script
578 */
579 public Change(int line0, int line1, int deleted, int inserted, Change old) {
580 this.line0 = line0;
581 this.line1 = line1;
582 this.inserted = inserted;
583 this.deleted = deleted;
584 this.link = old;
585 }
586
587 /**
588 * Returns the number of insertions and deletions of this change as well as
589 * (recursively) the changes linked via {@link #link}.
590 * @return recursive number of insertions and deletions
591 */
592 public int getTotalNumberOfChanges() {
593 return inserted + deleted + (link != null ? link.getTotalNumberOfChanges() : 0);
594 }
595
596 @Override
597 public String toString() {
598 String s = String.format("%d -%d +%d %d", line0, deleted, inserted, line1);
599 return (link != null) ? s = s + '\n' + link : s;
600 }
601 }
602
603 /**
604 * Data on one input file being compared.
605 */
606 class FileData {
607
608 /** Allocate changed array for the results of comparison. */
609 void clear() {
610 // Allocate a flag for each line of each file, saying whether that line is an insertion or deletion.
611 // Allocate an extra element, always zero, at each end of each vector.
612 changedFlag = new boolean[bufferedLines + 2];
613 }
614
615 /**
616 * Return equiv_count[I] as the number of lines in this file that fall in equivalence class I.
617 * @return the array of equivalence class counts.
618 */
619 int[] equivCount() {
620 int[] equiv_count = new int[equivMax];
621 for (int i = 0; i < bufferedLines; ++i) {
622 ++equiv_count[equivs[i]];
623 }
624 return equiv_count;
625 }
626
627 /**
628 * Discard lines that have no matches in another file.
629 *
630 * A line which is discarded will not be considered by the actual comparison algorithm;
631 * it will be as if that line were not in the file.
632 * The file's `realindexes' table maps virtual line numbers
633 * (which don't count the discarded lines) into real line numbers;
634 * this is how the actual comparison algorithm produces results
635 * that are comprehensible when the discarded lines are counted.
636 * <p>
637 * When we discard a line, we also mark it as a deletion or insertion so that it will be printed in the output.
638 * @param f the other file
639 */
640 void discard_confusing_lines(FileData f) {
641 clear();
642 // Set up table of which lines are going to be discarded.
643 final byte[] discarded = discardable(f.equivCount());
644
645 // Don't really discard the provisional lines except when they occur in a run of discardables,
646 // with nonprovisionals at the beginning and end.
647 filterDiscards(discarded);
648
649 // Actually discard the lines.
650 discard(discarded);
651 }
652
653 /**
654 * Mark to be discarded each line that matches no line of another file.
655 * If a line matches many lines, mark it as provisionally discardable.
656 * @param counts The count of each equivalence number for the other file.
657 * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable for each line
658 * @see #equivCount()
659 */
660 private byte[] discardable(final int[] counts) {
661 final int end = bufferedLines;
662 final byte[] discards = new byte[end];
663 final int[] equivs = this.equivs;
664 int many = 5;
665 int tem = end / 64;
666
667 /* Multiply MANY by approximate square root of number of lines.
668 That is the threshold for provisionally discardable lines. */
669 while ((tem = tem >> 2) > 0) {
670 many *= 2;
671 }
672
673 for (int i = 0; i < end; i++) {
674 int nmatch;
675 if (equivs[i] == 0) {
676 continue;
677 }
678 nmatch = counts[equivs[i]];
679 if (nmatch == 0) {
680 discards[i] = 1;
681 } else if (nmatch > many) {
682 discards[i] = 2;
683 }
684 }
685 return discards;
686 }
687
688 /**
689 * Don't really discard the provisional lines except when they occur
690 * in a run of discardables, with nonprovisionals at the beginning and end.
691 * @param discards discards
692 */
693 private void filterDiscards(final byte[] discards) {
694 final int end = bufferedLines;
695
696 for (int i = 0; i < end; i++) {
697 /* Cancel provisional discards not in middle of run of discards. */
698 if (discards[i] == 2) {
699 discards[i] = 0;
700 } else if (discards[i] != 0) {
701 /* We have found a nonprovisional discard. */
702 int j;
703 int length;
704 int provisional = 0;
705
706 /* Find end of this run of discardable lines.
707 Count how many are provisionally discardable. */
708 for (j = i; j < end; j++) {
709 if (discards[j] == 0) {
710 break;
711 }
712 if (discards[j] == 2) {
713 ++provisional;
714 }
715 }
716
717 /* Cancel provisional discards at end, and shrink the run. */
718 while (j > i && discards[j - 1] == 2) {
719 discards[--j] = 0; --provisional;
720 }
721
722 /* Now we have the length of a run of discardable lines
723 whose first and last are not provisional. */
724 length = j - i;
725
726 /* If 1/4 of the lines in the run are provisional,
727 cancel discarding of all provisional lines in the run. */
728 if (provisional * 4 > length) {
729 while (j > i)
730 if (discards[--j] == 2) {
731 discards[j] = 0;
732 }
733 } else {
734 int consec;
735 int minimum = 1;
736 int tem = length / 4;
737
738 /* MINIMUM is approximate square root of LENGTH/4.
739 A subrun of two or more provisionals can stand
740 when LENGTH is at least 16.
741 A subrun of 4 or more can stand when LENGTH >= 64. */
742 while ((tem = tem >> 2) > 0) {
743 minimum *= 2;
744 }
745 minimum++;
746
747 /* Cancel any subrun of MINIMUM or more provisionals
748 within the larger run. */
749 for (j = 0, consec = 0; j < length; j++) {
750 if (discards[i + j] != 2) {
751 consec = 0;
752 } else if (minimum == ++consec) {
753 /* Back up to start of subrun, to cancel it all. */
754 j -= consec;
755 } else if (minimum < consec) {
756 discards[i + j] = 0;
757 }
758 }
759
760 /* Scan from beginning of run
761 until we find 3 or more nonprovisionals in a row
762 or until the first nonprovisional at least 8 lines in.
763 Until that point, cancel any provisionals. */
764 for (j = 0, consec = 0; j < length; j++) {
765 if (j >= 8 && discards[i + j] == 1) {
766 break;
767 }
768 if (discards[i + j] == 2) {
769 consec = 0; discards[i + j] = 0;
770 } else if (discards[i + j] == 0) {
771 consec = 0;
772 } else {
773 consec++;
774 }
775 if (consec == 3) {
776 break;
777 }
778 }
779
780 /* I advances to the last line of the run. */
781 i += length - 1;
782
783 /* Same thing, from end. */
784 for (j = 0, consec = 0; j < length; j++) {
785 if (j >= 8 && discards[i - j] == 1) {
786 break;
787 }
788 if (discards[i - j] == 2) {
789 consec = 0; discards[i - j] = 0;
790 } else if (discards[i - j] == 0) {
791 consec = 0;
792 } else {
793 consec++;
794 }
795 if (consec == 3) {
796 break;
797 }
798 }
799 }
800 }
801 }
802 }
803
804 /** Actually discard the lines.
805 @param discards flags lines to be discarded
806 */
807 private void discard(final byte[] discards) {
808 final int end = bufferedLines;
809 int j = 0;
810 for (int i = 0; i < end; ++i) {
811 if (noDiscards || discards[i] == 0) {
812 undiscarded[j] = equivs[i];
813 realindexes[j++] = i;
814 } else {
815 changedFlag[1+i] = true;
816 }
817 }
818 nondiscardedLines = j;
819 }
820
821 FileData(int length) {
822 bufferedLines = length;
823 equivs = new int[length];
824 undiscarded = new int[bufferedLines];
825 realindexes = new int[bufferedLines];
826 }
827
828 FileData(Object[] data, Map<Object, Integer> h) {
829 this(data.length);
830 // FIXME: diff 2.7 removes common prefix and common suffix
831 for (int i = 0; i < data.length; ++i) {
832 Integer ir = h.get(data[i]);
833 if (ir == null) {
834 h.put(data[i], equivs[i] = equivMax++);
835 } else {
836 equivs[i] = ir.intValue();
837 }
838 }
839 }
840
841 /** Adjust inserts/deletes of blank lines to join changes
842 as much as possible.
843
844 We do something when a run of changed lines include a blank
845 line at one end and have an excluded blank line at the other.
846 We are free to choose which blank line is included.
847 `compareseq' always chooses the one at the beginning,
848 but usually it is cleaner to consider the following blank line
849 to be the "change". The only exception is if the preceding blank line
850 would join this change to other changes.
851 @param f the file being compared against
852 */
853 void shift_boundaries(FileData f) {
854 final boolean[] changed = changedFlag;
855 final boolean[] other_changed = f.changedFlag;
856 int i = 0;
857 int j = 0;
858 int i_end = bufferedLines;
859 int preceding = -1;
860 int other_preceding = -1;
861
862 for (;;) {
863 int start, end, other_start;
864
865 /* Scan forwards to find beginning of another run of changes.
866 Also keep track of the corresponding point in the other file. */
867
868 while (i < i_end && !changed[1+i]) {
869 while (other_changed[1+j++]) {
870 /* Non-corresponding lines in the other file
871 will count as the preceding batch of changes. */
872 other_preceding = j;
873 }
874 i++;
875 }
876
877 if (i == i_end) {
878 break;
879 }
880
881 start = i;
882 other_start = j;
883
884 for (;;) {
885 /* Now find the end of this run of changes. */
886
887 while (i < i_end && changed[1+i]) {
888 i++;
889 }
890 end = i;
891
892 /* If the first changed line matches the following unchanged one,
893 and this run does not follow right after a previous run,
894 and there are no lines deleted from the other file here,
895 then classify the first changed line as unchanged
896 and the following line as changed in its place. */
897
898 /* You might ask, how could this run follow right after another?
899 Only because the previous run was shifted here. */
900
901 if (end != i_end && equivs[start] == equivs[end] && !other_changed[1+j]
902 && !((preceding >= 0 && start == preceding) || (other_preceding >= 0 && other_start == other_preceding))) {
903 changed[1+end++] = true;
904 changed[1+start++] = false;
905 ++i;
906 /* Since one line-that-matches is now before this run
907 instead of after, we must advance in the other file
908 to keep in synch. */
909 ++j;
910 } else {
911 break;
912 }
913 }
914
915 preceding = i;
916 other_preceding = j;
917 }
918 }
919
920 /** Number of elements (lines) in this file. */
921 private final int bufferedLines;
922
923 /** Vector, indexed by line number, containing an equivalence code for
924 each line. It is this vector that is actually compared with that
925 of another file to generate differences. */
926 private final int[] equivs;
927
928 /** Vector, like the previous one except that
929 the elements for discarded lines have been squeezed out. */
930 private final int[] undiscarded;
931
932 /** Vector mapping virtual line numbers (not counting discarded lines)
933 to real ones (counting those lines). Both are origin-0. */
934 private final int[] realindexes;
935
936 /** Total number of nondiscarded lines. */
937 private int nondiscardedLines;
938
939 /** Array, indexed by real origin-1 line number,
940 containing true for a line that is an insertion or a deletion.
941 The results of comparison are stored here. */
942 private boolean[] changedFlag;
943 }
944}
Note: See TracBrowser for help on using the repository browser.